In [1]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [2]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [3]:
# Load the tab-separated tweet dump, reading only the columns used further down.
df = pd.read_csv(
    './datasets/tweetsreplies4.tsv',
    sep="\t",
    encoding='cp1252',
    usecols=['timestamp_ms', 'longitude', 'latitude', 'text', 'lang'],
)

## Preprocessing
Remove rows with missing text and filter out non-English tweets. Get detailed locations and timestamps, and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder https://github.com/thampiman/reverse-geocoder
* Convert ms timestamps to datetime object.
* Preprocess text using the preprocessor https://github.com/s/preprocessor and remove stopwords using gensim.

In [4]:
# Row label 585877 (= 585341 + 536) is removed first: this row makes p.clean crash.
df = df.drop(index=585341 + 536)

# Keep English tweets only; the language column is not needed after that.
df = df[df['lang'] == 'en'].drop(columns=['lang'])

# Discard tweets whose text is empty or missing, then renumber the rows from 0.
has_text = (df['text'] != '') & df['text'].notna()
df = df[has_text].reset_index(drop=True)

In [5]:
# Build one (latitude, longitude) tuple per tweet and look them all up in a single call.
coordinates = list(zip(df['latitude'], df['longitude']))
locations = rg.search(coordinates)

Loading formatted geocoded file...


In [6]:
# Flatten the geocoder results into a table and keep only the place name,
# the two admin fields and the country code.
locations_df = pd.json_normalize(locations).loc[:, ['name', 'admin1', 'admin2', 'cc']]

In [7]:
# Attach the location columns side by side. axis=1 aligns rows on the index: df was
# renumbered with reset_index() earlier; locations_df is assumed to carry the matching
# default 0..n-1 index — TODO confirm.
df = pd.concat([df, locations_df], axis=1)

In [8]:
#df = df.drop(df[df.cc != 'US'].index)

In [9]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to consecutive row chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``n_cores`` consecutive row chunks; when
        the row count does not divide evenly, the leading chunks get one extra row.
    func : callable
        Receives one chunk (a DataFrame) and returns a DataFrame. It is handed to
        ``Pool.map``.
    n_cores : int, default 4
        Number of chunks and number of worker processes.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    # Slice with iloc instead of np.array_split: array_split on a DataFrame goes
    # through DataFrame.swapaxes, which pandas has deprecated. The chunk sizes
    # below are the same ones array_split produces.
    base, extra = divmod(len(df), n_cores)
    chunks = []
    start = 0
    for i in range(n_cores):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # The context manager shuts the pool down even when func raises, so a failing
    # chunk no longer leaves worker processes behind.
    with Pool(n_cores) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

In [10]:
# Select the token types p.clean should handle (presumably strip — see the
# preprocessor docs): URLs, mentions, reserved words, emojis, smileys and numbers.
p.set_options(
    p.OPT.URL,
    p.OPT.MENTION,
    p.OPT.RESERVED,
    p.OPT.EMOJI,
    p.OPT.SMILEY,
    p.OPT.NUMBER,
)
def add_features(df):
    """Run ``p.clean`` over every entry of the ``text`` column, in place.

    A row on which ``p.clean`` raises keeps its original text and is reported on
    stdout, so a single bad tweet no longer leaves the whole chunk uncleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column; the column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    def _clean_or_keep(text):
        # Best effort per row. Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        try:
            return p.clean(text)
        except Exception as exc:
            print(f"p.clean failed ({exc!r}); keeping original text: {text!r}")
            return text

    df['text'] = df['text'].apply(_clean_or_keep)
    return df
# Run add_features over row chunks of the frame in worker processes and
# reassemble the cleaned chunks.
df = parallelize_dataframe(df, add_features)

In [11]:
# Lower-case and normalise the tweet text; must be done after cleaning.
def normalize_text(text):
    """Return a lower-cased, de-noised copy of a Series of tweet strings.

    Steps, in order:
      1. lower-case;
      2. delete digit runs;
      3. collapse stretched letters to a single letter: runs of two or more for
         a/i/u/y, runs of three or more for every other ASCII letter;
      4. turn underscores into spaces, remove a stand-alone " rt " token and
         remove '#';
      5. replace any remaining non-word, non-space character with a space;
      6. collapse whitespace runs to a single space.

    ``regex=`` is passed explicitly on every call: since pandas 2.0 the default of
    ``Series.str.replace`` is ``regex=False``, under which the patterns would be
    matched as literal text.
    """
    return (
        text.str.lower()
        .str.replace(r'\d+', '', regex=True)
        # Back-references replace the 26 per-letter patterns; each run is collapsed
        # independently, so the result is the same as applying them one by one.
        .str.replace(r'([aiuy])\1+', r'\1', regex=True)
        .str.replace(r'([b-hj-tv-xz])\1{2,}', r'\1', regex=True)
        .str.replace('_', ' ', regex=False)
        # Replace with a space, not '', so the words on either side are not fused.
        .str.replace(' rt ', ' ', regex=False)
        .str.replace('#', '', regex=False)
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s\s+', ' ', regex=True)
    )


df['text'] = normalize_text(df['text'])

In [12]:
# remove stopwords, must be done after cleaning and removal of white space
def remove_stopwords_f(df):
    """Strip stopwords from every entry of the ``text`` column, in place.

    Each value of ``df['text']`` is passed through ``remove_stopwords``; the column
    is overwritten with the results and the same frame is returned.
    """
    without_stopwords = df['text'].apply(remove_stopwords)
    df['text'] = without_stopwords
    return df
# Apply remove_stopwords_f to row chunks of the frame in worker processes.
df = parallelize_dataframe(df, remove_stopwords_f)

In [13]:
# remove words with less than 3 characters, should be done after removing stop words
def remove_short_words(x):
    """Return ``x`` without the space-separated tokens shorter than three characters."""
    kept = (token for token in x.split(" ") if len(token) >= 3)
    return ' '.join(kept)
def remove_short_words_f(df):
    """Apply ``remove_short_words`` to every entry of the ``text`` column, in place.

    The column is overwritten with the shortened texts and the same frame is
    returned.
    """
    shortened = df['text'].apply(remove_short_words)
    df['text'] = shortened
    return df
# Apply remove_short_words_f to row chunks of the frame in worker processes.
df = parallelize_dataframe(df, remove_short_words_f)

In [15]:
# df = df.head(10000)

In [16]:
def remove_short_texts_f(df):
    """Return the rows of ``df`` whose ``text`` has at least three space-separated tokens.

    Tokens are counted with ``text.split(" ")``, so an empty string counts as one
    token. Rows are selected with a boolean mask, which means:

    * the input frame is left untouched (no scratch column is written to it, and an
      existing column named ``c`` is not overwritten);
    * rows are judged one by one, even when several rows share an index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the sufficiently long tweets, with the original
        index labels and columns.
    """
    token_counts = df['text'].apply(lambda text: len(text.split(" ")))
    return df[token_counts >= 3]

# Remove tweets with fewer than 3 words (space-separated tokens), processing the
# frame chunk by chunk through parallelize_dataframe.
df = parallelize_dataframe(df, remove_short_texts_f)
# Display the filtered frame.
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,admin2,cc
1,1443887442007,-66.578926,6.422820,quiero que sea sponsor zayn malik goals rts sa...,Puerto Carreno,Vichada,,CO
3,1443888522674,-73.948775,40.655138,wouldn analogy sense went church eat christians,Brooklyn,New York,Kings County,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,Multnomah County,US
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,Kings County,US
8,1443891813050,-2.298134,52.847090,aww thank gillian olly follow thank,Eccleshall,England,Staffordshire,GB
...,...,...,...,...,...,...,...,...
9995,1444214110880,88.352242,22.612706,favu bcoz knowv sidehero,Chakapara,West Bengal,Haora,IN
9996,1444214385820,72.570232,23.013959,current state amethi best example hard work,Ahmedabad,Gujarat,Ahmadabad,IN
9997,1444214711324,72.570232,23.013959,people alleged development work took place vil...,Ahmedabad,Gujarat,Ahmadabad,IN
9998,1444214681705,72.570232,23.013959,people darkha village comes amethi decided boy...,Ahmedabad,Gujarat,Ahmadabad,IN


In [17]:
# Filter out tweets whose text is an empty string.
df = df.loc[df['text'] != '']

In [18]:
# Order tweets chronologically; rows with the same timestamp are ordered by admin1.
df = df.sort_values(by=['timestamp_ms', 'admin1'])

In [19]:
# The admin2 column is not kept in the exported data.
df = df.drop(columns=['admin2'])

In [20]:
# Keep just the first 1000 rows and display them.
df = df.iloc[:1000]
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
1,1443887442007,-66.578926,6.422820,quiero que sea sponsor zayn malik goals rts sa...,Puerto Carreno,Vichada,CO
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,US
12,1443888294480,-100.076888,31.168893,atlanta backwards atlanta,Eden,Texas,US
18,1443888510207,-89.266507,39.739300,republican strategist actually said morning ch...,Edinburg,Illinois,US
...,...,...,...,...,...,...,...
809,1443929955976,-79.980689,40.431389,currently mario shadyside handing koozies,Pittsburgh,Pennsylvania,US
229,1443930099195,-82.388215,36.344668,thanks appreciate makeadifference addvalue tod...,Johnson City,Tennessee,US
1696,1443930134934,-117.434222,47.667438,shit pants stage stay tuned,Spokane,Washington,US
1707,1443930511894,-100.076888,31.168893,okay astros fans weve torn rooting agree want ...,Eden,Texas,US


In [None]:
# Keep only the leading run of tweets: from the first point where two consecutive
# tweets fall more than one calendar day apart, everything is discarded.

# diff() below compares neighbouring rows, so the frame must be in chronological
# order first. The sort result is assigned (it was previously discarded), and a
# stable sort preserves the existing order of rows with equal timestamps.
df = df.sort_values(by=['timestamp_ms'], kind='stable')

# .dt.normalize() keeps a datetime64 Series. .dt.date would produce Python date
# objects, whose diff() is an object Series without a .dt accessor.
tweet_day = pd.to_datetime(df['timestamp_ms'], unit='ms').dt.normalize()

# The group id increases by one at every gap of more than one day; group 0 is the
# run before the first such gap. The first row's diff is NaT, which compares False.
gap_group = (tweet_day.diff().dt.days > 1).cumsum()

# Filter with a mask instead of writing temporary 'date'/'groups' columns onto df.
df = df[gap_group == 0]
df

In [48]:
# printf-style record layout: seven fields (one integer, two floats with 8 decimals,
# four strings), each on its own line. In this cell it is only referenced by the
# commented-out np.savetxt export below.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
#np.savetxt(r'dataset85257EnUS.txt', df.values, fmt=fmt, delimiter='\r\n')

In [21]:
# Record layout string; it is not passed to the to_csv call below.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
# Write the final frame to a CSV file at a path relative to the working directory.
df.to_csv(r'1000EnForSynth.csv')