In [1]:
import pandas as pd
import csv
import glob

In [3]:
#!pip install requests
#!pip install textblob
#!pip install vaderSentiment
#!pip install tweet-preprocessor
#!pip install emoji
#!pip install seaborn

In [2]:
import os

In [72]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
import emoji
import preprocessor as p
import re
import math
import json
import ast
AFINN_PATH = '../sentiment/AFINN-111.txt'
HEDONO_PATH = "../sentiment/Data_Set_S1.txt"
TEXT_FIELD = 'pure_text'
p.set_options(p.OPT.URL, p.OPT.EMOJI)
str_weather_terms = '''aerovane air airstream altocumulus altostratus anemometer anemometers anticyclone anticyclones \
arctic arid aridity atmosphere atmospheric autumn autumnal balmy baroclinic barometer barometers \
barometric blizzard blizzards blustering blustery blustery breeze breezes breezy brisk calm \
celsius chill chilled chillier chilliest chilly chinook cirrocumulus cirrostratus cirrus climate climates \
cloud cloudburst cloudbursts cloudier cloudiest clouds cloudy cold colder coldest condensation \
contrail contrails cool cooled cooling cools cumulonimbus cumulus cyclone cyclones damp damp \
damper damper dampest dampest degree degrees deluge dew dews dewy doppler downburst \
downbursts downdraft downdrafts downpour downpours dried drier dries driest drizzle drizzled \
drizzles drizzly drought droughts dry dryline fall farenheit flood flooded flooding floods flurries \
flurry fog fogbow fogbows fogged fogging foggy fogs forecast forecasted forecasting forecasts freeze \
freezes freezing frigid frost frostier frostiest frosts frosty froze frozen gale gales galoshes gust \
gusting gusts gusty haboob haboobs hail hailed hailing hails haze hazes hazy heat heated heating \
heats hoarfrost hot hotter hottest humid humidity hurricane hurricanes ice iced ices icing icy \
inclement landspout landspouts lightning lightnings macroburst macrobursts maelstrom mercury \
meteorologic meteorologist meteorologists meteorology microburst microbursts microclimate \
microclimates millibar millibars mist misted mists misty moist moisture monsoon monsoons \
mugginess muggy nexrad nippy NOAA nor’easter nor’easters noreaster noreasters overcast ozone \
parched parching pollen precipitate precipitated precipitates precipitating precipitation psychrometer \
radar rain rainboots rainbow rainbows raincoat raincoats rained rainfall rainier rainiest \
raining rains rainy sandstorm sandstorms scorcher scorching searing shower showering showers \
skiff sleet slicker slickers slush slushy smog smoggier smoggiest smoggy snow snowed snowier \
snowiest snowing snowmageddon snowpocalypse snows snowy spring sprinkle sprinkles sprinkling \
squall squalls squally storm stormed stormier stormiest storming storms stormy stratocumulus \
stratus subtropical summer summery sun sunnier sunniest sunny temperate temperature tempest \
thaw thawed thawing thaws thermometer thunder thundered thundering thunders thunderstorm \
thunderstorms tornadic tornado tornadoes tropical troposphere tsunami turbulent twister twisters \
typhoon typhoons umbrella umbrellas vane warm warmed warming warms warmth waterspout \
waterspouts weather wet wetter wettest wind windchill windchills windier windiest windspeed \
windy winter wintery wintry'''
LST_WEATHER_TERMS = str_weather_terms.split(' ')
DICT_WEATHER_TERMS = {LST_WEATHER_TERMS[i]: 1 for i in range(len(LST_WEATHER_TERMS))}

def CheckWeatherTerm(text):
    '''
    Return 1 or 0 for whether input contains any weather term
    '''
    words = nltk.word_tokenize(text)
    for w in words:
        if w in DICT_WEATHER_TERMS:
            return 1
    return 0
#https://github.com/s/preprocessor/issues/50
class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

def keepemoji_clean(text):
    '''
    clean text with tweet-preprocessor
    '''
    text = emoji.demojize(text)
    import signal
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(2)
    try:
        r = p.clean(text)
    except TimeoutException:
        print(f"Could not handle the {text}")
        r = text
    else:
        signal.alarm(0)

    return r


In [4]:

def afinn_sentiment(text,afinn,pattern_split):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence. 
    """
    words = pattern_split.split(text.lower())
    sentiments = list(map(lambda word: afinn.get(word, 0), words))
    leng = len(sentiments)
    if leng > 0:
        # How should you weight the individual word sentiments? 
        # You could do N, sqrt(N) or 1 for example. Here I use sqrt(N)
        sentiment = float(sum(list(sentiments)))/math.sqrt(len(list(sentiments)))
    else:
        sentiment = 0
    return sentiment

def add_afinn(df):
    filenameAFINN = AFINN_PATH
    afinn = dict(map(lambda ws: (ws[0], int(ws[1])), [ 
            ws.strip().split('\t') for ws in open(filenameAFINN) ]))
    pattern_split = re.compile(r"\W+")
    df['afinn'] = df[TEXT_FIELD].map(lambda x:afinn_sentiment(x,afinn,pattern_split))
    return df

In [5]:
def load_scores(filename):
    """Takes a file from the Dodd research paper and returns a dict of
    wordscores. Note this function is tailored to the file provided
    by the Dodd paper. For other sets of word scores, a dict can be
    passed directly to HMeter."""
    doddfile = csv.reader(open(filename, "r"), delimiter='\t')
    for x in range(4):  # strip header info
        next(doddfile)
    return {row[0]: float(row[2]) for row in doddfile}

class HMeter(object):
    """HMeter is the main class to prepare a text sample for scores. It
    expects a list of individual words, such as those provided by 
    nltk.word_tokenize, as wordlist. It expects a dict of words as k and
    floating point wordscores as v for wordscores. deltah allows us to 
    filter out the most neutral words as stop words."""
    def __init__(self, wordlist, wordscores, deltah=0.0):
        self.wordlist = wordlist
        self.wordscores = wordscores
        self.deltah = deltah
    _deltah = None
    @property
    def deltah(self):
        """Deltah determines stop words. The higher deltah the more neutral 
        words are are discarded from the matchlist."""
        return self._deltah
    @deltah.setter
    def deltah(self, deltah):
        """Each time deltah is set we need to regenerate the matchlist."""
        self._deltah = deltah
        # TODO Should probably raise a range error if deltah is nonsensical
        # first we take every word that matches labMT 1.0
        labmtmatches = (word for word in self.wordlist
                        if word in self.wordscores)
        # then we strip out stop words as described by Dodd paper
        self.matchlist = []
        for word in labmtmatches:
            score = self.wordscores[word]
            if score >= 5.0 + self.deltah or score <= 5.0 - self.deltah:
                self.matchlist.append(word)
    def fractional_abundance(self, word):
        """Takes a word and return its fractional abundance within
        self.matchlist"""
        frac_abund = self.matchlist.count(word) / len(self.matchlist)
        return frac_abund
    def word_shift(self, comp):
        """Produces data necessary to create a word shift graph. Returns a list 
        of tuples that contain each word's contribution to happiness score shift 
        between two samples. So for example, assigned to a variable 'output_data'
        output_data[n] represents the data for one word where:
            
        output_data[n][0] the word
        output_data[n][1] the proportional contribution the word gives to overall
                          word shift
        output_data[n][2] The relative abundance of word between the two samples
        output_data[n][3] The word's happiness relative to the refernce sample
        
        Using this data, we can construct word shift graphs as described here:
        http://www.hedonometer.org/shifts.html"""
        # initialize variables for potentially large loop.
        # create our comparison object. self is the reference object.
        tcomp = HMeter(comp, self.deltah)
        # we want a list of all potential words, but only need each word once.
        word_shift_list = set(tcomp.matchlist + self.matchlist)
        output_data = []
        ref_happiness_score = self.happiness_score()
        comp_happiness_score = tcomp.happiness_score()
        happy_diff = comp_happiness_score - ref_happiness_score
        for word in word_shift_list:
            abundance = (tcomp.fractional_abundance(word) -
                         self.fractional_abundance(word))
            happiness_shift = self.wordscores[word] - ref_happiness_score
            paper_score = (happiness_shift * abundance * 100) / happy_diff
            output_data.append((word, paper_score, abundance, happiness_shift))
        # sort words by absolute value of individual word shift
        output_data.sort(key=lambda word: abs(word[1]))
        return output_data
    def happiness_score(self):
        """Takes a list made up of individual words and returns the happiness
        score."""
        happysum = 0
        count = len(self.matchlist)
        for word in self.matchlist:
            happysum += self.wordscores[word]
        if count != 0:  # divide by zero errors are sad.
            return happysum / count
        else:
            pass  # empty lists have no score


In [6]:


def hmeter_sentiment(text,pattern_split,scores):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence. 
    """
    words = pattern_split.split(text.lower())
    h = HMeter(words,scores)
    return h.happiness_score()

def add_hedono(df):
    scores = load_scores(HEDONO_PATH)
    pattern_split = re.compile(r"\W+")
    df['hedono'] = df[TEXT_FIELD].map(lambda x:hmeter_sentiment(x,pattern_split,scores))
    return df

def add_vader(df):
    analyzer = SentimentIntensityAnalyzer()
    df['vader'] = df[TEXT_FIELD].map(lambda x:analyzer.polarity_scores(x)['compound'])
    return df
def add_all_sentiment(df):
    '''
    calculate sentiment scores for field TEXT_FIELD
    '''
    df = add_afinn(df)
    df = add_hedono(df)
    df = add_vader(df)
    return df

In [7]:
tweet_df = pd.read_csv("../../../data/tweets/csv/usa_tweets_2012_chunk_9.csv",lineterminator='\n',dtype={'id':str,'tweet_created_at':str})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
PROJECT_ROOT = '/gpfs/data1/oshangp/liuz/sesync'
READ_PATH = PROJECT_ROOT+'/data/tweets/csv'
SAVE_PATH = PROJECT_ROOT+'/data/processed/sen'
donefiles = os.listdir(SAVE_PATH)
files = [os.path.join(READ_PATH, f) for f in os.listdir(READ_PATH) if f.endswith('.csv') and f not in donefiles]

In [24]:
files

['/gpfs/data1/oshangp/liuz/sesync/data/tweets/csv/usa_tweets_2014_a_chunk_12.csv',
 '/gpfs/data1/oshangp/liuz/sesync/data/tweets/csv/usa_tweets_2014_b_chunk_8.csv',
 '/gpfs/data1/oshangp/liuz/sesync/data/tweets/csv/usa_tweets_2014_c_chunk_11.csv',
 '/gpfs/data1/oshangp/liuz/sesync/data/tweets/csv/usa_tweets_2014_a_chunk_21.csv',
 '/gpfs/data1/oshangp/liuz/sesync/data/tweets/csv/usa_tweets_2012_chunk_9.csv']

In [25]:
tweet_df.shape

(1000000, 29)

In [21]:
#tweet_df[tweet_df['fastText_lang']=='tl'].sample(20).pure_text

982358            @_CathyVazquez @TheBucktList ay dios mio😶
657072    As usual, tinawanan lang ako. And may sira na ...
240282    Ilang years na ba akong di nakapagbowling........
467925         Magbabad sa bath ng 10 min then higa na. 👄👅👌
507199                             DJ ang in the hizz house
5878                   Sobrang chiks ng mga clippers dancer
155161    Sumigaw ako ng "natatae ako" nakalimutan kong ...
113428    Marami naman talaga iba jan e. Ikaw nga lang t...
902925    “@ateIrms: @kkkath000: Matteo at Sarah palalim...
485481    @Blessed7th @abbygaiil__ @just1n1nt1me22 ay ay...
957696                                         ay que tuani
551702                                   @Succ_Da_Lang wyo?
136755                               @TunityTV Gotcha. TKS!
954378    @kaiceVK14 kaya mag-UD ka na! Chos! Hahaha gal...
712509    @ayeemndz kayo nalang ni tatay ulit nay hahaha...
18845     Kahit madaming tao gusto ko parin pumunta bsta...
722926    @Coriyssa sarap naman ng buhay

In [26]:
def sentiment(df):
    df = df[df['pure_text'].notna()]
    df['tweet_created_at_int'] = df['tweet_created_at'].apply(lambda x: json.loads(x.replace("'", '"'))['$date'])
    df = df[['id','pure_text','tweet_created_at_int','fastText_lang','lat','lon']]
    df['clean_text'] = df['pure_text'].map(lambda x:keepemoji_clean(x))
    df['weather_term'] = df['clean_text'].map(lambda x:CheckWeatherTerm(x))
    df = add_all_sentiment(df)
    df = df.drop(columns=['pure_text','clean_text'])
    return df

In [17]:
tweet_df.iloc[i:i+5000]

NameError: name 'i' is not defined

In [66]:
 txt = tweet_df.iloc[762192:762193]['pure_text'].values[0]

In [67]:
txt = emoji.demojize(txt)

In [70]:
txt = '“@CalJDaGeneral45: @MHBEATS http://t.co/3P5Xpe4T”!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! '

In [69]:
txt

'“@CalJDaGeneral45: @MHBEATS http://t.co/3P5Xpe4T”!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [71]:
p.clean(txt           )

KeyboardInterrupt: 

In [39]:
add_all_sentiment(tweet_df.iloc[23576:23577])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,fastText_conf,tweet_created_at,user_created_at,pure_text,lat,lon,rt_text,afinn,hedono,vader
23576,Tue Apr 29 20:28:40 +0000 2014,461240673715224600,461240673715224576,Google+\n\nhttps://t.co/yZqsJwzaB2\n\n...........,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,0.86,{'$date': 1398803320000},{'$date': 1352905527000},Google+\n\nhttps://t.co/yZqsJwzaB2\n\n...........,42.949427,-87.894438,,0.0,5.456667,0.0


In [53]:
tweet_df.iloc[762192:762193].pure_text

762192    “@CalJDaGeneral45: @MHBEATS http://t.co/3P5Xpe...
Name: pure_text, dtype: object

In [74]:
sentiment(tweet_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Could not handle the “@CalJDaGeneral45: @MHBEATS http://t.co/3P5Xpe4T”!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


KeyboardInterrupt: 

In [75]:
for i in range(750000,len(tweet_df),5000):
    sentiment(tweet_df.iloc[i:i+5000])
    print(i)

750000
755000
Could not handle the “@CalJDaGeneral45: @MHBEATS http://t.co/3P5Xpe4T”!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
760000
765000


KeyboardInterrupt: 

In [15]:
import time

# Start time
start_time = time.time()

# Your code here
# For example, a loop or a function call
sentiment_df = sentiment(tweet_df)
#sentiment_df['tweet_created_at'].apply(lambda x: json.loads(x.replace("'", '"'))['$date'])
#sentiment_df['tweet_created_at'].apply(lambda x: ast.literal_eval(x)['$date'])
# End time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

KeyboardInterrupt: 

In [22]:
sentiment_df

Unnamed: 0,id_str,tweet_created_at_int,lat,lon,weather_term,afinn,hedono,vader
0,550441276752293888,1420070401000,39.905443,-86.077531,0,-1.336306,4.924000,-0.2023
1,550441280661360641,1420070402000,33.987111,-83.984996,0,0.500000,6.280000,0.7672
2,550441280892067841,1420070402000,36.318364,-115.213250,0,0.904534,5.530000,0.3818
3,550441280858112000,1420070402000,34.374670,-118.592765,1,0.000000,5.705714,0.0000
4,550441280853901312,1420070402000,31.592575,-102.885392,0,0.000000,5.124000,0.0000
...,...,...,...,...,...,...,...,...
999995,552216771814772736,1420493712000,43.410781,-84.609748,0,0.000000,5.710000,0.0000
999996,552216771667587072,1420493712000,29.714756,-95.380429,0,1.459601,5.724286,0.7783
999997,552216775639969792,1420493713000,39.553933,-76.388266,0,0.000000,,0.0000
999998,552216775681515520,1420493713000,30.292801,-97.744604,0,0.267261,5.868333,0.1027


In [56]:
sentiment_df.query('weather_term > 0')

Unnamed: 0,id_str,tweet_created_at,weather_term,afinn,hedono,vader
3,550441280858112000,{'$date': 1420070402000},1,0.000000,5.705714,0.0000
6,550441280929406977,{'$date': 1420070402000},1,0.377964,5.660000,0.3182
51,550441330988421122,{'$date': 1420070414000},1,-1.224745,5.136667,-0.6447
72,550441356502376448,{'$date': 1420070420000},1,0.000000,5.502667,0.0000
121,550441410852179968,{'$date': 1420070433000},1,0.000000,5.320000,-0.3353
...,...,...,...,...,...,...
999863,552216528259915776,{'$date': 1420493654000},1,0.229416,5.570000,0.5707
999879,552216565920579584,{'$date': 1420493663000},1,1.066004,5.517778,0.8074
999947,552216683730173952,{'$date': 1420493691000},1,-0.242536,5.682000,0.4567
999959,552216708845666305,{'$date': 1420493697000},1,0.577350,7.200000,0.3182


In [23]:
tmp_set = set(tweet_df.columns.tolist())

In [23]:
# Folder path containing the CSV files
folder_path = "../../../data/tweets/csv/*.csv"

# Initialize a variable to keep track of the total row count
total_rows = 0

# Use glob to get a list of CSV file paths in the folder
csv_files = glob.glob(folder_path)

In [24]:
csv_files

['../../../data/tweets/csv/usa_tweets_2014_a_chunk_20.csv',
 '../../../data/tweets/csv/usa_tweets_2015_chunk_13.csv',
 '../../../data/tweets/csv/usa_tweets_2014_b_chunk_7.csv',
 '../../../data/tweets/csv/usa_tweets_2014_a_chunk_5.csv',
 '../../../data/tweets/csv/usa_tweets_2015_chunk_9.csv',
 '../../../data/tweets/csv/usa_tweets_2014_c_chunk_12.csv',
 '../../../data/tweets/csv/usa_tweets_2014_b_chunk_11.csv',
 '../../../data/tweets/csv/usa_tweets_2011_chunk_2.csv',
 '../../../data/tweets/csv/usa_tweets_2012_chunk_38.csv',
 '../../../data/tweets/csv/usa_tweets_2013_chunk_1.csv',
 '../../../data/tweets/csv/usa_tweets_2014_b_chunk_3.csv',
 '../../../data/tweets/csv/usa_tweets_2011_chunk_14.csv',
 '../../../data/tweets/csv/usa_tweets_2014_a_chunk_7.csv',
 '../../../data/tweets/csv/usa_tweets_2014_b_chunk_17.csv',
 '../../../data/tweets/csv/usa_tweets_2012_chunk_31.csv',
 '../../../data/tweets/csv/usa_tweets_2012_chunk_15.csv',
 '../../../data/tweets/csv/usa_tweets_2014_a_chunk_29.csv',
 '.

In [45]:
# for csv_file in csv_files:
#     tweet_df = pd.read_csv(csv_file,nrows=6)
#     d1 = tmp_set - set(tweet_df.columns.tolist())
#     d2 = set(tweet_df.columns.tolist()) - tmp_set
#     if d1:
#         print(csv_file,d1)
#     if d2:
#         print(csv_file,d2)

In [47]:
df = tweet_df

In [50]:
import os

In [25]:
import multiprocessing
def count_lines_in_csv(file_path):
    try:
        # with open(file_path, "r",newline='') as file:
        #     csv_reader = csv.reader(file)
        #     # Count the rows in the current CSV file and add to the total
        #     len_lines = sum(1 for row in csv_reader)
        #     #print(file_path)
        len_lines = pd.read_csv(file_path,lineterminator='\n').shape[0]
        return len_lines
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return 0  # Return 0 lines in case of an error
def count_language_in_csv(file_path):
    try:
        # with open(file_path, "r",newline='') as file:
        #     csv_reader = csv.reader(file)
        #     # Count the rows in the current CSV file and add to the total
        #     len_lines = sum(1 for row in csv_reader)
        #     #print(file_path)
        df = pd.read_csv(file_path,lineterminator='\n') 
        len_lines = df.shape[0]
        if 'fastText_lang' in df.columns:
            col_language = 'fastText_lang'
        else:
            print(f'no lang col in {file_path}')
            return 0,0
        lang_en = df[df[col_language]=='en'].shape[0]
        return len_lines,lang_en
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return 0,0  # Return 0 lines in case of an error

# Create a multiprocessing pool
pool = multiprocessing.Pool(20)

# Map the function to count lines to the file paths
#line_counts = pool.map(count_lines_in_csv, csv_files)
line_counts = pool.map(count_language_in_csv, csv_files)
# Close the pool and wait for the processes to finish
pool.close()
pool.join()

# Calculate the total lines by summing up individual counts
#total_lines = sum(line_counts)
#print(f"Total lines in all CSV files: {total_lines}")

# Separate and sum the elements
sum_a = sum(a for a, b in line_counts)
sum_b = sum(b for a, b in line_counts)



  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return lis

  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return lis

  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))


In [26]:
sum_a

231310037

In [27]:
sum_b

206192313

In [28]:
# percentage of english detect by 
sum_b/sum_a

0.8914110069508138

In [14]:
line_counts

[1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 367033,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 255910,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 490916,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 488941,
 475565,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000000,
 1000

../../../data/tweets/csv/usa_tweets_2011_chunk_2.csv


In [12]:
pd.read_csv('../../../data/tweets/csv/usa_tweets_2011_chunk_2.csv',lineterminator='\n').shape[0]

  exec(code_obj, self.user_global_ns, self.user_ns)


1000000