In [1]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [2]:
# Load only the columns used below from the tab-separated tweet dump.
# NOTE(review): the displayed text further down shows garbled characters where emoji
# would be expected, which may mean the file is not really cp1252 - confirm the encoding.
df = pd.read_csv(
    './datasets/tweetsreplies4.tsv',
    sep="\t",
    encoding='cp1252',
    usecols=['timestamp_ms', 'longitude', 'latitude', 'text', 'lang'],
)

## Preprocessing
Remove rows with missing text and filter out non-English tweets. Then derive detailed locations and timestamps, and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder: https://github.com/thampiman/reverse-geocoder
* Convert millisecond timestamps to datetime objects.
* Preprocess text using the preprocessor https://github.com/s/preprocessor and remove stopwords using gensim.

In [3]:
# Row label 585341+536 is removed first: this row makes the function p.clean crash.
# After that keep only English tweets with present, non-empty text, discard the
# language column and renumber the surviving rows from 0.
df = (
    df.drop(index=585341 + 536)
      .loc[lambda t: t['lang'].eq('en') & t['text'].notna() & t['text'].ne('')]
      .drop(columns='lang')
      .reset_index(drop=True)
)

In [4]:
# Build one (latitude, longitude) tuple per row, in row order, and look them all up
# in a single batch call.
coordinates = list(zip(df['latitude'], df['longitude']))
locations = rg.search(coordinates)

Loading formatted geocoded file...


In [5]:
# Turn the geocoder results into a frame and keep only the place-name fields.
locations_df = pd.json_normalize(locations).loc[:, ['name', 'admin1', 'admin2', 'cc']]

In [6]:
# Append the location columns to the right of the tweet columns; rows are matched on
# their index labels.
df = pd.concat(objs=[df, locations_df], axis='columns')

In [7]:
# Keep only the rows whose country code is 'US'.
df = df[df['cc'] == 'US']

In [8]:
# Display the frame as it stands after the country filter.
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,admin2,cc
2,1443887501098,-122.228685,37.791994,@adamjohnsonNYC IT'S TRUE THOUGH,Alameda,California,Alameda County,US
3,1443888522674,-73.948775,40.655138,Wouldn't this analogy only make sense if it we...,Brooklyn,New York,Kings County,US
4,1443887588840,-122.630908,45.536402,@adamjohnsonNYC going to Taco Bell absolutely ...,Portland,Oregon,Multnomah County,US
5,1443887443914,-73.948775,40.655138,CONFIRMED: Justin Bieber is incredibly dumb h...,Brooklyn,New York,Kings County,US
11,1446737951938,-96.621634,32.907642,@OskaFlockaFlame this is crazy ðŸ˜©,Garland,Texas,Dallas County,US
...,...,...,...,...,...,...,...,...
260449,1450627502758,-93.417133,37.607955,@DrKenABC That Is Being Good With Your Money -...,Bolivar,Missouri,Polk County,US
260450,1450634807171,-73.113976,41.824678,@RereRhman @She_Devil643 @paparosso60 @aprilra...,Torrington,Connecticut,Litchfield County,US
260451,1450627608898,-75.117998,40.004866,So laid back they start assuming..ðŸ¤? https:/...,Philadelphia,Pennsylvania,Philadelphia County,US
260452,1450629163634,-71.180979,42.395497,this has been a photo album on my phone for li...,Belmont,Massachusetts,Middlesex County,US


In [9]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to consecutive row chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is split by row position into ``n_cores`` chunks of
        (almost) equal size.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. When more than one
        worker is used the function is sent to worker processes, so it has to be
        picklable (e.g. a module-level ``def``).
    n_cores : int, default 4
        Number of chunks, and number of worker processes when greater than 1.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    # Split row positions rather than the frame itself: calling np.array_split
    # directly on a DataFrame is deprecated in recent pandas releases.
    positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[idx] for idx in positions]

    if n_cores == 1:
        # A single chunk gains nothing from a worker process; running it here also
        # avoids the pickling round-trip.
        return pd.concat([func(df_split[0])])

    # The context manager releases the worker processes even when func raises.
    with Pool(n_cores) as pool:
        return pd.concat(pool.map(func, df_split))

In [10]:
# Select which token types p.clean removes from a tweet.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)


def _clean_tweet(text):
    """Run ``p.clean`` on a single tweet.

    If ``p.clean`` raises, the offending tweet is printed and an empty string is
    returned, so one bad tweet no longer leaves its whole chunk uncleaned.
    """
    try:
        return p.clean(text)
    except Exception as exc:  # deliberately best-effort, but not a bare except
        print('p.clean failed for tweet {!r}: {!r}'.format(text, exc))
        return ''


def add_features(df):
    """Clean the ``text`` column of the chunk ``df`` in place and return ``df``.

    Each entry is passed through ``p.clean`` with the options set above. Entries on
    which ``p.clean`` fails become ``''``.
    """
    df['text'] = df['text'].apply(_clean_tweet)
    return df


df = parallelize_dataframe(df, add_features)

In [11]:
# make lower case and remove numbers and white space, must be done after cleaning
def normalize_text(text):
    """Lower-case and simplify a Series of tweet strings.

    Steps, in order: lower-case, remove digit runs, collapse stretched letters,
    turn underscores into spaces, remove the stand-alone word ``rt``, remove ``#``,
    turn every remaining non-word / non-space character into a space, and collapse
    runs of whitespace into a single space.

    Every pattern is passed with an explicit ``regex=`` flag: the default of
    ``Series.str.replace`` differs between pandas versions, and with ``regex=False``
    the patterns would be searched for literally.

    Parameters
    ----------
    text : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        The normalised strings, same index as ``text``.
    """
    return (
        text.str.lower()
        .str.replace(r'\d+', '', regex=True)
        # a, i, u, y: two or more in a row collapse to one.
        .str.replace(r'([aiuy])\1+', r'\1', regex=True)
        # every other ASCII letter: three or more in a row collapse to one.
        .str.replace(r'([b-hj-tv-xz])\1{2,}', r'\1', regex=True)
        .str.replace('_', ' ', regex=False)
        # Replace the word "rt" with a space instead of deleting ' rt ' outright,
        # which used to glue the neighbouring words together.
        .str.replace(r'\brt\b', ' ', regex=True)
        .str.replace('#', '', regex=False)
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s\s+', ' ', regex=True)
    )


df['text'] = normalize_text(df['text'])

In [12]:
# remove stopwords, must be done after cleaning and removal of white space
def remove_stopwords_f(df):
    """Pass every entry of ``df['text']`` through gensim's ``remove_stopwords``.

    The column is overwritten in place and the same frame is returned.
    """
    without_stopwords = df['text'].map(remove_stopwords)
    df['text'] = without_stopwords
    return df


df = parallelize_dataframe(df=df, func=remove_stopwords_f)

In [13]:
# remove words with less than 3 characters, should be done after removing stop words
def remove_short_words(x):
    """Return ``x`` without its short tokens.

    ``x`` is split on single spaces; tokens of fewer than 3 characters (including
    the empty tokens produced by consecutive spaces) are discarded and the
    remaining tokens are joined back with single spaces.
    """
    return ' '.join(token for token in x.split(" ") if len(token) > 2)
def remove_short_words_f(df):
    """Apply ``remove_short_words`` to every entry of ``df['text']``.

    The column is overwritten in place and the same frame is returned.
    """
    shortened = df['text'].map(remove_short_words)
    df['text'] = shortened
    return df


df = parallelize_dataframe(df=df, func=remove_short_words_f)
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,admin2,cc
2,1443887501098,-122.228685,37.791994,true,Alameda,California,Alameda County,US
3,1443888522674,-73.948775,40.655138,wouldn analogy sense went church eat christians,Brooklyn,New York,Kings County,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,Multnomah County,US
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,Kings County,US
11,1446737951938,-96.621634,32.907642,crazy,Garland,Texas,Dallas County,US
...,...,...,...,...,...,...,...,...
260449,1450627502758,-93.417133,37.607955,good money trade imbalances tolerated,Bolivar,Missouri,Polk County,US
260450,1450634807171,-73.113976,41.824678,lovely,Torrington,Connecticut,Litchfield County,US
260451,1450627608898,-75.117998,40.004866,laid start assuming,Philadelphia,Pennsylvania,Philadelphia County,US
260452,1450629163634,-71.180979,42.395497,photo album phone like months loved machina fa...,Belmont,Massachusetts,Middlesex County,US


In [14]:
# Keep only the rows whose text is not the empty string.
df = df[df['text'] != '']

In [15]:
# Order the rows by timestamp, using admin1 to order rows that share a timestamp.
df = df.sort_values(['timestamp_ms', 'admin1'])

In [16]:
# Remove the admin2 column from the frame.
df = df.drop(columns='admin2')

In [17]:
# Disabled switch: when uncommented, the next line keeps only the first 1000 rows of df.
#df = df.head(1000)
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,US
2,1443887501098,-122.228685,37.791994,true,Alameda,California,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,US
12,1443888294480,-100.076888,31.168893,atlanta backwards atlanta,Eden,Texas,US
18,1443888510207,-89.266507,39.739300,republican strategist actually said morning ch...,Edinburg,Illinois,US
...,...,...,...,...,...,...,...
162969,1569634560964,-115.223125,36.232915,literally snol snort loud,Las Vegas,Nevada,US
31959,1570185297888,-118.411907,34.020789,clip basic reality right skid row sleep thou,Culver City,California,US
165435,1570805944991,-85.658852,38.227750,com story php story fbid amp amp ref bookmarks,Saint Matthews,Kentucky,US
164505,1572644823143,-96.331528,32.975669,loves magic ent roysecity,Royse City,Texas,US


In [49]:
# Keep only the first uninterrupted stretch of tweets. Walking the rows in their current
# order (the frame was sorted by timestamp_ms in an earlier cell), everything from the
# first jump of more than one calendar day onwards is discarded.
# normalize() truncates to midnight but keeps the datetime64 dtype, so diff() yields real
# timedeltas and .dt.days does not depend on pandas inferring them from object-dtype
# datetime.date values. Using a mask also avoids writing temporary columns into df.
tweet_day = pd.to_datetime(df['timestamp_ms'], unit='ms').dt.normalize()
gap_count = tweet_day.diff().dt.days.gt(1).cumsum()
df = df[gap_count == 0]
df.sort_values(by=['timestamp_ms'])

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,US
2,1443887501098,-122.228685,37.791994,true,Alameda,California,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,US
12,1443888294480,-100.076888,31.168893,atlanta backwards atlanta,Eden,Texas,US
18,1443888510207,-89.266507,39.739300,republican strategist actually said morning ch...,Edinburg,Illinois,US
...,...,...,...,...,...,...,...
155600,1460351942580,-80.007367,32.770073,seconds retweet missing year old woman old son...,Charleston,South Carolina,US
72169,1460357254132,-81.367773,28.501679,tell black men need stick support gay dick riders,Edgewood,Florida,US
225927,1460361818106,-122.435978,37.770657,lit,San Francisco,California,US
35407,1460385235324,-96.785531,32.604786,group,Lancaster,Texas,US


In [48]:
# fmt has seven fields separated by newlines, so each row of the frame is written as
# seven consecutive lines. Because fmt is a multi-field format string, np.savetxt
# ignores the delimiter argument.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
np.savetxt(fname=r'dataset85257EnUS.txt', X=df.to_numpy(), fmt=fmt, delimiter='\r\n')

In [22]:
# NOTE(review): fmt is assigned here but is not used by the to_csv call below.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
# Write the current frame to a CSV file with pandas' default settings.
df.to_csv(path_or_buf=r'dataset1000En.csv')