In [5]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [6]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [9]:
# Read the first 100,000 rows of the tab-separated tweet dump, keeping only the
# columns used by the rest of the notebook.
it = pd.read_csv('./datasets/tweets1_33_1_100k.txt', encoding='utf8',
                 sep="\t", iterator=True,
                 usecols=['timestamp_ms', 'longitude', 'latitude', 'text', 'lang'])
try:
    df = it.get_chunk(100000)
finally:
    # With iterator=True the reader keeps the file open until it is closed.
    it.close()
df

Unnamed: 0,timestamp_ms,longitude,latitude,lang,text
0,1506352756521,-73.976086,40.752321,en,Me encanta 😍! (@ Grand Central Market in New Y...
1,1506352756950,-120.598328,37.350299,en,Recovery mode
2,1506352757130,-95.446487,29.838495,en,Not everyone can do good hair to me
3,1506352757145,-116.527000,32.595000,en,Sgt. Manciati and Detective Zito on the prowl ...
4,1506352757145,-80.313674,25.712215,en,Beelion https://t.co/x1JKsC8mc5
...,...,...,...,...,...
98463,1506357870154,-81.382695,40.885188,en,I can't wait to move to Kent
98464,1506357870451,-72.650649,41.562321,en,"Want to work in #Middletown, CT? View our late..."
98465,1506357870393,-106.417430,31.793368,en,He so amazing https://t.co/PtggVOAS0a
98466,1506357870428,-87.478814,41.638191,en,Can't get ova her 😜 https://t.co/8h5AhjzPpi


## Preprocessing
Remove rows with missing text and filter out non-English tweets. Get detailed locations and timestamps, and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder https://github.com/thampiman/reverse-geocoder
* Convert ms timestamps to datetime object.
* Preprocess text using the preprocessor https://github.com/s/preprocessor and remove stopwords using gensim.

In [11]:
# NOTE: a manual workaround used to live here, df.drop(585341+536), because that
# row made p.clean crash; it is currently disabled.

# Keep English tweets that have non-empty, non-missing text.
is_english = df.lang == 'en'
has_text = df.text.notna() & (df.text != '')
df = (
    df[is_english & has_text]
    .drop(columns=['lang'])
    .reset_index(drop=True)
)

In [12]:
# Build one (latitude, longitude) tuple per tweet and look them all up at once.
coordinates = list(zip(df['latitude'], df['longitude']))
locations = rg.search(coordinates)

Loading formatted geocoded file...


In [13]:
# Flatten the geocoder results and keep only the place-name fields.
location_columns = ['name', 'admin1', 'admin2', 'cc']
locations_df = pd.json_normalize(locations).loc[:, location_columns]

In [14]:
# Append the location columns side by side; rows are matched on the index.
df = pd.concat(objs=[df, locations_df], axis='columns')

In [15]:
#df = df.drop(df[df.cc != 'US'].index)

In [16]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is split by row position into ``n_cores`` chunks of
        near-equal size (some chunks may be empty when ``df`` has few rows).
    func : callable
        Called once per chunk; must return something ``pd.concat`` accepts.
        When ``n_cores > 1`` it is sent to worker processes, so it must be
        picklable.
    n_cores : int, default 4
        Number of chunks / worker processes. With ``n_cores == 1`` the work is
        done in the current process and no pool is started.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError('n_cores must be at least 1')

    # Split row positions rather than the frame itself: np.array_split(df, ...)
    # goes through DataFrame.swapaxes, which pandas has deprecated.
    positions = np.array_split(np.arange(len(df)), n_cores)
    chunks = [df.iloc[rows] for rows in positions]

    if n_cores == 1:
        return pd.concat([func(chunk) for chunk in chunks])

    # The context manager shuts the workers down even when func raises.
    with Pool(n_cores) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

In [17]:
# Select the token categories the tweet preprocessor `p` is configured with.
p.set_options(
    p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED,
    p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER,
)
def _safe_clean(text):
    """Return ``p.clean(text)``, or ``text`` unchanged if cleaning raises."""
    try:
        return p.clean(text)
    except Exception as exc:
        # Best effort: report the offending tweet and keep it as-is, so that one
        # bad row does not leave the rest of its chunk uncleaned.
        print(f'p.clean failed ({exc!r}) for text: {text!r}')
        return text


def add_features(df):
    """Clean the ``text`` column of ``df`` with ``p.clean`` and return ``df``.

    The column is overwritten on the frame that was passed in. Tweets for which
    ``p.clean`` raises are reported with ``print`` and left unmodified; every
    other tweet is still cleaned.
    """
    df['text'] = df['text'].apply(_safe_clean)
    return df
# Clean the tweet text chunk by chunk in worker processes.
df = parallelize_dataframe(df=df, func=add_features)

In [18]:
# Lower-case the text, strip digits, squeeze repeated letters and normalise
# punctuation/whitespace. Must run after the p.clean step.
#
# regex= is spelled out on every call: pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would turn the patterns below into
# literal strings. Patterns are raw strings so '\d', '\w' and '\s' are not
# treated as (invalid) string escapes.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    # Runs of 2+ of a/i/u/y collapse to a single letter ...
    .str.replace(r'([aiuy])\1+', r'\1', regex=True)
    # ... every other letter collapses only on runs of 3+.
    .str.replace(r'([b-hj-tvwxz])\1{2,}', r'\1', regex=True)
    .str.replace('_', ' ', regex=False)
    # Replace the standalone "rt" token with a space (not an empty string) so
    # the words on either side of it are not glued together.
    .str.replace(' rt ', ' ', regex=False)
    .str.replace('#', '', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)

In [19]:
# Stopword removal; has to run after cleaning and whitespace normalisation.
def remove_stopwords_f(df):
    """Apply ``remove_stopwords`` to each entry of ``df['text']``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    without_stopwords = df['text'].map(remove_stopwords)
    df['text'] = without_stopwords
    return df
# Strip stopwords chunk by chunk in worker processes.
df = parallelize_dataframe(df=df, func=remove_stopwords_f)

In [20]:
# Drop words shorter than three characters; meant to run after stopword removal.
def remove_short_words(x):
    """Return ``x`` keeping only the space-separated words longer than 2 characters."""
    kept = (word for word in x.split(" ") if len(word) > 2)
    return ' '.join(kept)


def remove_short_words_f(df):
    """Apply ``remove_short_words`` to each entry of ``df['text']``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    shortened = df['text'].map(remove_short_words)
    df['text'] = shortened
    return df
# Remove short words chunk by chunk in worker processes.
df = parallelize_dataframe(df=df, func=remove_short_words_f)

In [22]:
# Continue with the first 10,000 rows only.
df = df.iloc[:10000]

In [23]:
def remove_short_texts_f(df):
    """Return the rows of ``df`` whose ``text`` has at least 3 space-separated words.

    Words are counted with ``text.split(" ")``, so an empty string counts as one
    word. The word counts are kept in a local Series instead of a temporary
    ``'c'`` column, so the frame passed in is not modified and an existing
    ``'c'`` column is preserved. Rows are selected with a boolean mask, which
    also stays correct when index labels repeat.
    """
    word_counts = df['text'].map(lambda text: len(text.split(" ")))
    return df[word_counts >= 3]

# Drop tweets with fewer than 3 words, chunk by chunk in worker processes.
df = parallelize_dataframe(df=df, func=remove_short_texts_f)
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,admin2,cc
0,1506352756521,-73.976086,40.752321,encanta grand central market new york,Long Island City,New York,Queens County,US
3,1506352757145,-116.527000,32.595000,sgt manciati detective zito prowl divineknight...,Hacienda Tecate,Baja California,Tecate,MX
5,1506352757147,-116.971680,44.027501,days season premiere lawandordersvu svu,Ontario,Oregon,Malheur County,US
6,1506352757249,-122.435978,37.770657,afcvwba today points come arsenal afc westbrom...,San Francisco,California,San Francisco County,US
9,1506352757550,-85.557614,38.194744,yes potus hates disrespects people excersizing...,Jeffersontown,Kentucky,Jefferson County,US
...,...,...,...,...,...,...,...,...
9990,1506353280048,-121.873730,37.331159,epic time california entries days amp today da...,San Jose,California,Santa Clara County,US
9992,1506353280169,-105.550891,38.997936,gem know birthday,Cripple Creek,Colorado,Teller County,US
9997,1506353281121,-73.968542,40.780709,trump invites russian national basketball team...,Manhattan,New York,New York County,US
9998,1506353281217,-79.420865,38.003375,look perches large wild cockatoo parrots birbs,East Lexington,Virginia,Rockbridge County,US


In [24]:
# Discard tweets whose text ended up empty after preprocessing.
df = df[df.text != '']

In [25]:
# Order by timestamp, then by admin1 for tweets sharing a timestamp.
sort_keys = ['timestamp_ms', 'admin1']
df = df.sort_values(by=sort_keys)

In [26]:
# admin2 is not used from here on.
df = df.drop(columns=['admin2'])

In [27]:
# Keep the first 1,000 rows of the sorted frame and display them.
df = df.iloc[:1000]
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
0,1506352756521,-73.976086,40.752321,encanta grand central market new york,Long Island City,New York,US
3,1506352757145,-116.527000,32.595000,sgt manciati detective zito prowl divineknight...,Hacienda Tecate,Baja California,MX
5,1506352757147,-116.971680,44.027501,days season premiere lawandordersvu svu,Ontario,Oregon,US
6,1506352757249,-122.435978,37.770657,afcvwba today points come arsenal afc westbrom...,San Francisco,California,US
9,1506352757550,-85.557614,38.194744,yes potus hates disrespects people excersizing...,Jeffersontown,Kentucky,US
...,...,...,...,...,...,...,...
1243,1506352822431,-75.698660,45.388680,canal carleton university official,Ottawa,Ontario,CA
1244,1506352822476,-84.273139,33.248881,funny ass nigga job trying best ignore till ye...,Griffin,Georgia,US
1253,1506352822774,-78.878866,33.703391,dale pretty cool,Myrtle Beach,South Carolina,US
1247,1506352822776,-76.255686,36.895082,today agenda study test,Norfolk,Virginia,US


In [None]:
# Keep only the rows before the first gap of more than one day between
# consecutive tweets, then order them by timestamp.
#
# The day is kept as datetime64 (.dt.normalize()) rather than .dt.date: the
# latter yields an object column of datetime.date values, on which
# .diff().dt.days is not available. The helper values live in local Series so
# no temporary columns are written onto df.
tweet_day = pd.to_datetime(df['timestamp_ms'], unit='ms').dt.normalize()
gap_group = (tweet_day.diff().dt.days > 1).cumsum()

# sort_values returns a new frame, so the result has to be assigned. A stable
# sort keeps the existing relative order of tweets with equal timestamps.
df = df[gap_group == 0].sort_values(by=['timestamp_ms'], kind='mergesort')
df

In [48]:
# printf-style record layout, one field per line: an integer, two floats with
# 8 decimals, then four strings.
fmt = '\n'.join(['%d', '%.8f', '%.8f', '%s', '%s', '%s', '%s'])
#np.savetxt(r'dataset85257EnUS.txt', df.values, fmt=fmt, delimiter='\r\n')

In [28]:
# fmt is (re)defined here but not used by the CSV export below.
fmt = (
    '%d\n'
    '%.8f\n'
    '%.8f\n'
    '%s\n'
    '%s\n'
    '%s\n'
    '%s'
)
# Write the final frame, index included, to a CSV file in the working directory.
df.to_csv(path_or_buf=r'1_33_1000EnForSynth.csv')