In [26]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [27]:
# Load only the columns used further down from the tab-separated tweet dump.
df = pd.read_csv(
    '../../Semesteroppgave/datasets/dataset/dataset_twitter/tweetsreplies4.tsv',
    sep="\t",
    encoding='cp1252',
    usecols=['timestamp_ms', 'longitude', 'latitude', 'text', 'lang'],
)

## Preprocessing
Remove rows with missing text and filter out non-English tweets. Then get detailed locations and timestamps, and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder https://github.com/thampiman/reverse-geocoder
* Convert ms timestamps to datetime object.
* Preprocess text using the preprocessor https://github.com/s/preprocessor and remove stopwords using gensim.

In [28]:
# Label of the single row that makes p.clean crash; removed before anything else.
crashing_row_label = 585341 + 536
df = df.drop(index=crashing_row_label)

# Keep English tweets whose text is neither missing nor the empty string.
is_english = df['lang'] == 'en'
has_text = df['text'].notna() & (df['text'] != '')
df = df.loc[is_english & has_text]

# The language column is no longer needed; renumber the surviving rows from 0.
df = df.drop(columns=['lang']).reset_index(drop=True)

In [29]:
# Pair up (latitude, longitude) in row order and reverse-geocode all pairs in one call.
coordinates = list(zip(df['latitude'], df['longitude']))
locations = rg.search(coordinates)

In [30]:
# Turn the geocoder results into a frame and keep only the place-name fields.
location_columns = ['name', 'admin1', 'admin2', 'cc']
locations_df = pd.json_normalize(locations)[location_columns]

In [31]:
# Attach the location columns side by side; rows are matched on index label.
df = pd.concat(objs=[df, locations_df], axis='columns')

In [32]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and re-assemble.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Receives one chunk (a DataFrame) and returns a DataFrame. It is handed
        to worker processes through ``Pool.map``, so it must be picklable.
    n_cores : int, default 4
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    # Split the row positions and slice with iloc instead of passing the frame
    # itself to np.array_split: the chunks are the same consecutive, near-equal
    # row ranges, without relying on NumPy's handling of DataFrame inputs.
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_groups]

    # The context manager shuts the pool down even when func raises in a
    # worker, so failed runs no longer leave worker processes behind.
    with Pool(n_cores) as pool:
        results = pool.map(func, df_split)

    return pd.concat(results)

In [33]:
# Select which token classes p.clean strips from each tweet.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)


def _clean_or_keep(text):
    """Return ``p.clean(text)``; if cleaning raises, report the tweet and return it unchanged."""
    try:
        return p.clean(text)
    except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
        print('p.clean failed on {!r}: {!r}'.format(text, exc))
        return text


def add_features(df):
    """Clean the ``text`` column of one chunk with ``p.clean``.

    Cleaning is best-effort per tweet: a tweet that makes ``p.clean`` raise is
    printed and left as it was, while every other tweet in the chunk is still
    cleaned. (Previously a single failure left the whole chunk uncleaned.)

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with a ``text`` column; the column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same chunk, with ``text`` cleaned.
    """
    df['text'] = df['text'].apply(_clean_or_keep)
    return df


df = parallelize_dataframe(df, add_features)

In [34]:
# make lower case and remove numbers and white space, must be done after cleaning
#
# Every step passes regex=True/False explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently turn the regex
# steps into no-ops. Patterns are raw strings so that \d, \w and \s are not
# parsed as (invalid) string escape sequences.
df['text'] = (
    df['text']
    .str.lower()
    # drop digit runs
    .str.replace(r'\d+', '', regex=True)
    # a, i, u, y: collapse runs of two or more down to a single letter
    .str.replace(r'([aiuy])\1+', r'\1', regex=True)
    # every other letter a-z: collapse only runs of three or more
    .str.replace(r'([b-hj-tv-xz])\1{2,}', r'\1', regex=True)
    # \w matches '_', so underscores have to be turned into spaces explicitly
    .str.replace('_', ' ', regex=False)
    # replace the standalone "rt" token with one space; replacing it with ''
    # glued the neighbouring words together ("a rt b" -> "ab")
    .str.replace(' rt ', ' ', regex=False)
    .str.replace('#', '', regex=False)
    # remaining punctuation becomes a space ...
    .str.replace(r'[^\w\s]', ' ', regex=True)
    # ... and runs of whitespace are squeezed to a single space
    .str.replace(r'\s\s+', ' ', regex=True)
)

In [35]:
# Stop-word removal has to run on text that is already cleaned and whitespace-normalised.
def remove_stopwords_f(df):
    """Run gensim's ``remove_stopwords`` on every entry of the chunk's ``text`` column.

    The column is overwritten on the chunk that was passed in, and that same
    chunk is returned.
    """
    without_stopwords = df['text'].apply(remove_stopwords)
    df['text'] = without_stopwords
    return df


df = parallelize_dataframe(df, remove_stopwords_f)

In [36]:
# Discard rows whose text is now an empty string.
empty_text_labels = df.index[df['text'] == '']
df = df.drop(index=empty_text_labels)

In [37]:
# Order rows by timestamp, then by the admin1 region name.
sort_keys = ['timestamp_ms', 'admin1']
df = df.sort_values(by=sort_keys)

In [38]:
# The admin2 column is not kept in the exported dataset.
df = df.drop(columns=['admin2'])

In [39]:
# Keep only the first 1000 rows of the sorted frame, then display the result.
df = df.iloc[:1000]
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
1,1443887442007,-66.578926,6.422820,yo quiero que sea mi sponsor zayn malik goals ...,Puerto Carreno,Vichada,CO
5,1443887443914,-73.948775,40.655138,confirmed justin bieber incredibly dumb,Brooklyn,New York,US
2,1443887501098,-122.228685,37.791994,s true,Alameda,California,US
4,1443887588840,-122.630908,45.536402,going taco bell absolutely makes taco,Portland,Oregon,US
7,1443888128338,21.060742,52.232836,ofc,Praga Poludnie,Masovian Voivodeship,PL
...,...,...,...,...,...,...,...
1334,1443916393180,-79.272569,43.629311,tourtoronto tweeryourseat kidding sec row seats,Scarborough,Ontario,CA
529,1443916421554,-60.029848,-37.147576,s oh quiet emabiggestfansjustinbieber,Olavarria,Buenos Aires,AR
1305,1443916429182,-43.441578,-22.911422,cem rts cem vote pelo justin iwannahearwdymons...,Nilopolis,Rio de Janeiro,BR
726,1443916444910,-60.029848,-37.147576,u emabiggestfansjustinbieber,Olavarria,Buenos Aires,AR


In [40]:
# Per-row format string for the commented-out np.savetxt export below;
# no other visible cell reads it.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
#np.savetxt(r'dataset5000En.txt', df.values, fmt=fmt, delimiter='\r\n')

In [41]:
# Same format string as in the previous cell; the CSV export below does not use it.
fmt = '%d\n%.8f\n%.8f\n%s\n%s\n%s\n%s'
# Write the frame to CSV in the working directory (the index is written as the first column).
output_path = r'dataset1000En.csv'
df.to_csv(output_path)

In [17]:
# Scratch check of what p.clean strips from a sample tweet; the result is shown below.
p.clean("#what is https://vg.no @me FAV00 00 RT _")

'#what is FAV00 RT _'