In [2]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [3]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [5]:
# Load only the columns the pipeline needs from the tab-separated tweet dump.
# NOTE(review): the preview below shows sequences such as "Ã§", which usually
# means UTF-8 bytes decoded as cp1252 — confirm the file's real encoding.
df = pd.read_csv(
    './datasets/tweetsreplies4.tsv',
    sep="\t",
    encoding='cp1252',
    usecols=['timestamp_ms', 'longitude', 'latitude', 'text', 'lang'],
)
df

Unnamed: 0,timestamp_ms,longitude,latitude,lang,text
0,1443887012127,-3.703508,40.477795,es,El BarÃ§a lleva 5 penaltis a favor y el Madrid...
1,1443891910375,-3.612607,39.069300,es,@As_TomasRoncero tu lo k debes hacer es callar...
2,1443906913232,-3.816153,40.524724,es,@As_TomasRoncero ya cambiara eso subnormal tod...
3,1443887056044,-5.923545,37.383117,es,@As_TomasRoncero Envidia no?
4,1443887085262,-3.816153,40.524724,und,@As_TomasRoncero http://t.co/muugC3nrE3
...,...,...,...,...,...
992056,1450850972308,-48.722283,-26.827420,pt,@badlarryheroes vi teu user e lembrei de uma f...
992057,1450628369225,-48.722283,-26.827420,pt,"niallbabykitten, acho que jÃ¡ te falei mas, vo..."
992058,1450655347711,-48.722283,-26.827420,pt,leighadegirl anna banana disse que me ama nana...
992059,1450629163634,-71.180979,42.395497,en,this has been a photo album on my phone for li...


## Preprocessing
Remove rows with missing text and filter out non-English tweets. Get detailed locations and timestamps, and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder https://github.com/thampiman/reverse-geocoder
* Convert ms timestamps to datetime object.
* Preprocess text using the preprocessor https://github.com/s/preprocessor and remove stopwords using gensim.

In [6]:
# Keep English tweets that actually carry text, then renumber the rows 0..n-1
# so the column-wise concat with the geocoder results lines up row by row.
# Previously disabled workaround: df = df.drop(585341+536)  (row made p.clean crash)
is_english = df.lang == 'en'
df = df[is_english].drop(columns='lang')
has_text = df.text.notna() & (df.text != '')
df = df[has_text].reset_index(drop=True)

In [7]:
# One (latitude, longitude) tuple per tweet, in row order, handed to the
# reverse geocoder in a single batch call.
coordinates = list(zip(df['latitude'], df['longitude']))
locations = rg.search(coordinates)

Loading formatted geocoded file...


In [8]:
# Flatten the geocoder records into a frame and keep only the four fields used
# downstream.
locations_df = pd.json_normalize(locations).loc[:, ['name', 'admin1', 'admin2', 'cc']]

In [9]:
# Attach the location columns side by side; concat aligns on the index, so both
# frames are expected to share the same 0..n-1 row labels.
df = pd.concat((df, locations_df), axis='columns')

In [10]:
# Keep only the rows whose geocoded `cc` value is 'US'.
df = df[df.cc == 'US'].copy()

In [11]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to consecutive row chunks of ``df`` in worker processes.

    The frame is cut into ``n_cores`` consecutive chunks (same chunk sizes as
    ``np.array_split``), each chunk is sent to a ``multiprocessing.Pool``
    worker, and the frames returned by ``func`` are concatenated in chunk
    order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. It has to be
        picklable because it is shipped to the worker processes.
    n_cores : int, default 4
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``func(chunk)`` over all chunks.
    """
    # Split row *positions* with NumPy and slice the frame with iloc, instead
    # of handing the DataFrame itself to np.array_split (which goes through
    # the deprecated DataFrame.swapaxes in recent pandas versions).
    positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[chunk] for chunk in positions]

    pool = Pool(n_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the workers, even when ``func`` raises; otherwise
        # every failed run leaves ``n_cores`` processes behind.
        pool.close()
        pool.join()
    return pd.concat(results)

In [None]:
# Select which token kinds p.clean should handle (URL, mention, reserved
# word, emoji, smiley, number).
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)


def _clean_tweet(text):
    """Run ``p.clean`` on one tweet; return the tweet unchanged if cleaning fails."""
    try:
        return p.clean(text)
    except Exception as err:
        # Best effort: report the single offending tweet instead of silently
        # leaving the whole chunk uncleaned.
        print('p.clean failed (%s) for tweet: %r' % (err, text))
        return text


def add_features(df):
    """Clean every entry of ``df['text']`` with ``p.clean``.

    The column is overwritten in place and the same frame is returned.
    A tweet that makes ``p.clean`` raise is reported and kept as is, so one
    bad row no longer prevents the rest of its chunk from being cleaned.
    """
    df['text'] = df['text'].apply(_clean_tweet)
    return df


df = parallelize_dataframe(df, add_features)

In [None]:
# Lower-case the cleaned tweets, strip digits, squeeze stretched letters
# ("sooooo" -> "so"), and replace punctuation/underscores with single spaces.
# Must run after p.clean.
#
# * regex=True / regex=False is spelled out on every call: since pandas 2.0
#   str.replace treats the pattern as a literal string by default, which would
#   silently disable every regular expression below.
# * Raw strings avoid invalid-escape warnings for \d, \w and \s.
# * a, i, u and y are squeezed from runs of 2+, every other letter from runs
#   of 3+ (so "see", "too", "all" survive). The two back-reference patterns
#   are equivalent to one replace per letter, because squeezing a run of one
#   letter never creates or merges a run of another letter.
# * ' rt ' becomes a single space; replacing it with '' glued the neighbouring
#   words together.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'([aiuy])\1+', r'\1', regex=True)
    .str.replace(r'([b-hj-tv-xz])\1{2,}', r'\1', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.replace(' rt ', ' ', regex=False)
    .str.replace('#', '', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)

In [None]:
# Stop-word removal; has to run after cleaning and whitespace normalisation.
def remove_stopwords_f(df):
    """Overwrite ``df['text']`` with ``remove_stopwords`` applied to each entry and return ``df``."""
    without_stopwords = df['text'].map(remove_stopwords)
    df['text'] = without_stopwords
    return df


df = parallelize_dataframe(df, remove_stopwords_f)

In [None]:
# Drop every token shorter than three characters; meant to run after the
# stop-word removal step.
def remove_short_words(x):
    """Return ``x`` without its space-separated tokens of length 2 or less."""
    kept = []
    for token in x.split(" "):
        if len(token) > 2:
            kept.append(token)
    return ' '.join(kept)


def remove_short_words_f(df):
    """Apply ``remove_short_words`` to each entry of ``df['text']`` in place and return ``df``."""
    df['text'] = df['text'].map(remove_short_words)
    return df
# Run the short-word filter over the whole frame, chunked across worker processes.
df = parallelize_dataframe(df, remove_short_words_f)

In [83]:
#df = df.head(20000)

In [84]:
def remove_short_texts_f(df):
    """Return the rows of ``df`` whose ``text`` has at least three tokens.

    Tokens are counted by splitting on a single space, so an empty string
    counts as one token and is removed. The input frame is left untouched:
    no temporary column is written to it, and rows are selected with a
    boolean mask rather than dropped by index label, so repeated index labels
    cannot remove more rows than intended.
    """
    token_counts = df['text'].str.split(" ").str.len()
    return df[token_counts >= 3]

# Remove tweets with fewer than 3 words (chunks are processed in parallel
# worker processes), then display the result.
df = parallelize_dataframe(df, remove_short_texts_f)
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,admin2,cc
0,1506352756521,-73.976086,40.752321,encanta grand central market new york,Long Island City,New York,Queens County,US
3,1506352757145,-116.527000,32.595000,sgt manciati detective zito prowl divineknight...,Hacienda Tecate,Baja California,Tecate,MX
5,1506352757147,-116.971680,44.027501,days season premiere lawandordersvu svu,Ontario,Oregon,Malheur County,US
6,1506352757249,-122.435978,37.770657,afcvwba today points come arsenal afc westbrom...,San Francisco,California,San Francisco County,US
9,1506352757550,-85.557614,38.194744,yes potus hates disrespects people excersizing...,Jeffersontown,Kentucky,Jefferson County,US
...,...,...,...,...,...,...,...,...
19995,1506353809256,-79.383184,43.653226,interested job toronto great fit hospitality h...,Toronto,Ontario,,CA
19996,1506353809369,-95.834958,41.247487,like stop raining seconds class,Council Bluffs,Iowa,Pottawattamie County,US
19997,1506353809323,-96.291648,30.585472,accidentally fell asleep balcony door open wok...,College Station,Texas,Brazos County,US
19998,1506353809391,-86.939741,35.742433,latest springhill job click apply beauty advis...,Spring Hill,Tennessee,Maury County,US


In [85]:
# Discard rows whose text is the empty string.
df = df[df.text != ''].copy()

In [86]:
# Order rows by timestamp_ms; rows with equal timestamps are ordered by admin1.
df = df.sort_values(by=['timestamp_ms', 'admin1'])

In [87]:
# The admin2 column is not needed from here on.
df = df.drop(columns='admin2')

In [88]:
# Keep the first 10000 rows of the sorted frame and display them.
df = df.iloc[:10000]
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
0,1506352756521,-73.976086,40.752321,encanta grand central market new york,Long Island City,New York,US
3,1506352757145,-116.527000,32.595000,sgt manciati detective zito prowl divineknight...,Hacienda Tecate,Baja California,MX
5,1506352757147,-116.971680,44.027501,days season premiere lawandordersvu svu,Ontario,Oregon,US
6,1506352757249,-122.435978,37.770657,afcvwba today points come arsenal afc westbrom...,San Francisco,California,US
9,1506352757550,-85.557614,38.194744,yes potus hates disrespects people excersizing...,Jeffersontown,Kentucky,US
...,...,...,...,...,...,...,...
12332,1506353406238,-78.582572,35.694028,interested job garner great fit cosmetology hi...,Garner,North Carolina,US
12331,1506353406281,-87.732013,41.833585,lucky drafted starting wrs,Cicero,Illinois,US
12341,1506353406300,-111.931711,34.168253,missed hour come sub,Cordes Lakes,Arizona,US
12345,1506353406301,-84.433106,33.767195,imma boost hell cousin cuz understand proud great,Atlanta,Georgia,US


In [89]:
# Keep only the first run of consecutive calendar days: a jump of more than
# one day between neighbouring rows starts a new group, and only group 0 is
# kept. This relies on the rows already being in chronological order.
tweet_days = pd.to_datetime(df.timestamp_ms, unit='ms').dt.normalize()
gap_groups = (tweet_days.diff().dt.days > 1).cumsum()
# Select with a boolean mask instead of writing temporary 'date'/'groups'
# columns into a frame that is a slice of an earlier one.
df = df[gap_groups == 0]
# sort_values returns a new frame, so the result has to be assigned to take
# effect. mergesort is stable: rows with equal timestamps keep their order.
df = df.sort_values(by=['timestamp_ms'], kind='mergesort')
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
0,1506352756521,-73.976086,40.752321,encanta grand central market new york,Long Island City,New York,US
3,1506352757145,-116.527000,32.595000,sgt manciati detective zito prowl divineknight...,Hacienda Tecate,Baja California,MX
5,1506352757147,-116.971680,44.027501,days season premiere lawandordersvu svu,Ontario,Oregon,US
6,1506352757249,-122.435978,37.770657,afcvwba today points come arsenal afc westbrom...,San Francisco,California,US
9,1506352757550,-85.557614,38.194744,yes potus hates disrespects people excersizing...,Jeffersontown,Kentucky,US
...,...,...,...,...,...,...,...
12332,1506353406238,-78.582572,35.694028,interested job garner great fit cosmetology hi...,Garner,North Carolina,US
12331,1506353406281,-87.732013,41.833585,lucky drafted starting wrs,Cicero,Illinois,US
12341,1506353406300,-111.931711,34.168253,missed hour come sub,Cordes Lakes,Arizona,US
12345,1506353406301,-84.433106,33.767195,imma boost hell cousin cuz understand proud great,Atlanta,Georgia,US


In [48]:
# printf-style template with seven fields, one per output line; it is only
# referenced by the disabled np.savetxt export below.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
#np.savetxt(r'dataset85257EnUS.txt', df.values, fmt=fmt, delimiter='\r\n')

In [90]:
# NOTE(review): `fmt` is not used by the to_csv call in this cell.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
# Write the final frame to a CSV file in the current working directory.
df.to_csv(r'1_33_10000EnForSynth.csv')