# PSTA

In [10]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [11]:
# Load the tab-separated tweet dump, keeping only the four columns used below.
df = pd.read_csv(
    '../../Semesteroppgave/datasets/dataset/dataset_twitter/tweetsreplies4.tsv',
    sep="\t",
    encoding='cp1252',
    usecols=['timestamp_ms', 'longitude', 'latitude', 'text'],
)

## Preprocessing
Get detailed locations and timestamp and tokenize the text.
* Fetch more location info from longitude and latitude using reverse_geocoder: https://github.com/thampiman/reverse-geocoder
* Convert ms timestamps to datetime object.
* Preprocess the text using preprocessor (https://github.com/s/preprocessor) and remove stopwords using gensim.

In [12]:
# Pair every row's latitude with its longitude (in that order) as plain tuples,
# then resolve all of them with a single reverse-geocoder call.
lat_lon_pairs = zip(df['latitude'], df['longitude'])
coordinates = list(lat_lon_pairs)
locations = rg.search(coordinates)

In [13]:
# Flatten the geocoder results into a frame and keep only the place-name fields.
locations_flat = pd.json_normalize(locations)
locations_df = locations_flat[['name', 'admin1', 'admin2', 'cc']]

In [15]:
# Attach the location fields as extra columns. concat aligns the two frames on
# their index labels.
df = pd.concat([df, locations_df], axis='columns')

In [16]:
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is split into ``n_cores`` consecutive row chunks.
    func : callable
        Called with one chunk (a DataFrame) and expected to return a DataFrame.
        When ``n_cores`` is greater than 1 it runs in worker processes, so it
        must be picklable.
    n_cores : int, default 4
        Number of chunks and of worker processes. With ``n_cores=1`` the work
        is done in the current process and no pool is created.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is not positive (raised by ``np.array_split``).
    """
    # Split the row *positions* instead of the frame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes. The chunk
    # boundaries are the same ones np.array_split would produce for the frame.
    position_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[positions] for positions in position_chunks]

    if n_cores == 1:
        # A single chunk gains nothing from a worker process.
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager releases the worker processes even when func raises.
    with Pool(n_cores) as pool:
        return pd.concat(pool.map(func, df_split))

In [17]:
# Keep only rows that actually carry text: discard missing values and empty strings.
has_text = df.text.notna() & (df.text != '')
df = df[has_text]

# This row label makes the function p.clean crash, so it is removed up front.
# errors='ignore' keeps the cell re-runnable once the row is already gone.
CRASHING_ROW_LABEL = 585341 + 536
df = df.drop(index=CRASHING_ROW_LABEL, errors='ignore')

In [18]:
def add_features(df):
    """Clean the ``text`` column of ``df`` with ``p.clean``, one tweet at a time.

    A tweet that ``p.clean`` cannot handle is reported on stdout and left
    unchanged, so a single bad row no longer leaves the whole chunk uncleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text`` replaced by the cleaned strings.
    """
    def clean_or_keep(text):
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            return p.clean(text)
        except Exception as exc:
            print(f'p.clean failed on {text!r}: {exc!r}')
            return text

    df['text'] = df['text'].apply(clean_or_keep)
    return df
# Run the tweet cleaning over the whole frame, chunk by chunk.
df = parallelize_dataframe(df, func=add_features)

In [19]:
# Lower-case the text, replace every character that is neither a word character
# nor whitespace with a space, then collapse whitespace runs into one space.
# Must be done after cleaning.
# regex=True is explicit because Series.str.replace treats the pattern as a
# literal string by default from pandas 2.0 on; raw strings keep the
# backslashes from being read as (invalid) string escapes.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s\s+', ' ', regex=True)
)

In [20]:
# remove stopwords, must be done after cleaning and removal of white space
# my_func = lambda x: remove_stopwords(x['doc'])
def remove_stopwords_f(df):
    """Run ``remove_stopwords`` over every entry of the ``text`` column.

    The column is overwritten in place and the same frame is returned, which
    lets the function be used as a chunk worker.
    """
    text_column = df['text']
    without_stopwords = text_column.apply(remove_stopwords)
    df['text'] = without_stopwords
    return df
# Strip stopwords from the whole frame, chunk by chunk.
df = parallelize_dataframe(df, func=remove_stopwords_f)

In [21]:
# Discard the rows whose text is now the empty string.
empty_text_labels = df.index[df.text == '']
df = df.drop(empty_text_labels)

In [23]:
# Order the rows by timestamp, using admin1 to break ties.
sort_keys = ['timestamp_ms', 'admin1']
df = df.sort_values(by=sort_keys)

In [24]:
# The admin2 field is not part of the exported dataset.
df = df.drop(columns=['admin2'])

In [25]:
# Show the processed frame; the rendered output is abbreviated to its first and last rows.
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
0,1443887012127,-3.703508,40.477795,el bara lleva penaltis favor y el madrid slo v...,Tetuan de las Victorias,Madrid,ES
3,1443887056044,-5.923545,37.383117,envidia,Sevilla,Andalusia,ES
7,1443887073903,-66.578926,6.422820,acuerdo pero otra cosa es que ellos lo sepan c...,Puerto Carreno,Vichada,CO
5,1443887110156,-0.352529,39.422484,tu eres retrasado,Alfafar,Valencia,ES
13,1443887259772,20.951529,52.230058,oblaem si herbat,Ochota,Masovian Voivodeship,PL
...,...,...,...,...,...,...,...
707997,1573501662496,29.005222,41.021321,nasl gzel elolu neler neler yapyor zenmemek el...,UEskuedar,Istanbul,TR
849012,1574143147170,32.663877,39.914185,sunduun habere sen inandn m zor artlarda altnz...,Etimesgut,Ankara,TR
463066,1574422333977,-45.917442,-23.061418,bom dia junko,Sao Jose dos Campos,Sao Paulo,BR
392692,1575247560716,-80.476404,43.430240,radio stationi listen everyday turns day every...,Kitchener,Ontario,CA


In [26]:
# One format field per column, each on its own output line, so every DataFrame
# row becomes seven consecutive lines in the file.
fmt = (
    '%d\n'    # timestamp_ms
    '%.8f\n'  # longitude
    '%.8f\n'  # latitude
    '%s\n'    # text
    '%s\n'    # name
    '%s\n'    # admin1
    '%s'      # cc
)
# NOTE(review): the NumPy docs say `delimiter` is ignored when `fmt` is a single
# multi-format string like this one; confirm, then drop the argument.
np.savetxt(
    r'datasetAll.txt',
    df.values,
    fmt=fmt,
    delimiter='\r\n',
)