# Create a synthetic dataset

In [1]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import math
from multiprocessing import  Pool

In [2]:
# Number of timestamps (rows) generated per simulated day.
FMT = 4
# Total number of rows/timestamps in the synthetic dataset.
N = 1_000

In [3]:
# Two synthetic topics. Each one is described by a stop-word-filtered tweet
# text, the day on which it first appears (offset) and how often it recurs
# (period). Offsets and periods are multiplied by FMT when they are turned
# into row positions.

# Topic 1: food
topic1Offset = 0
topic1Period = 7
topic1 = remove_stopwords("parmesan is the best italian food ingredient my food life consists of italian parmesan tomatoes beef and parmesan lets eat all the food")

# Topic 2: football
topic2Offset = 4
topic2Period = 15
topic2 = remove_stopwords("football is my life kick a ball in the goal the goal is to win win win sports football is a sports lets kick")

In [4]:
# Load the pattern-free base dataset and discard the unnamed first column
# (presumably an index that was written out with the CSV -- verify).
df = (
    pd.read_csv('dataset1000En.csv', encoding='cp1252')
    .drop(columns=["Unnamed: 0"])
)

In [5]:
# Insert periodic topics
def _insert_periodic_topic(frame, text, start_day, period_days, day_offsets, slot):
    """Plant a recurring topic into ``frame`` in place and return its template rows.

    For every entry of ``day_offsets`` the row at position
    ``(start_day + offset) * FMT + slot`` receives ``text`` in its "text" column
    and is copied as a template, so each template keeps the location columns of
    the row it was taken from. The templates are then written over two
    consecutive rows (``slot`` and ``slot + 1``) of the matching day in every
    period; periods start at ``start_day * FMT`` and repeat every
    ``period_days * FMT`` rows while the period start is below ``N``.

    Target positions past the end of ``frame`` are skipped, so the last period
    may be only partially filled. All indexing is positional; it matches label
    indexing as long as ``frame`` has a default 0..len-1 index.

    Returns:
        list of ``pd.Series``: one template row per entry of ``day_offsets``.
    """
    text_col = frame.columns.get_loc("text")
    n_rows = len(frame)

    # Set the topic text at each location and remember the full row
    # (text plus location info) as the template for that location.
    templates = []
    for day in day_offsets:
        pos = (start_day + day) * FMT + slot
        frame.iloc[pos, text_col] = text
        templates.append(frame.iloc[pos].copy())

    # Repeat the templates periodically so the same text and location recur.
    for period_start in range(start_day * FMT, N, period_days * FMT):
        for day, template in zip(day_offsets, templates):
            first = period_start + day * FMT + slot
            for pos in (first, first + 1):
                # The final period can extend beyond the dataset; skip those
                # rows explicitly instead of relying on a swallowed IndexError.
                if pos < n_rows:
                    frame.iloc[pos] = template

    return templates


# Topic 1: insert at two locations (days 0 and 1 of each period),
# using the first two timestamps of the day.
row1loc1, row1loc2 = _insert_periodic_topic(
    df, topic1, topic1Offset, topic1Period, day_offsets=(0, 1), slot=0
)

# Topic 2: insert at four locations (days 0, 2, 3 and 7 of each period),
# same procedure as topic 1. It uses the last two timestamps per day to
# avoid colliding with topic 1.
row2loc1, row2loc2, row2loc3, row2loc4 = _insert_periodic_topic(
    df, topic2, topic2Offset, topic2Period, day_offsets=(0, 2, 3, 7), slot=2
)

In [6]:
# Set new timestamps: FMT evenly spaced timestamps per day starting at
# 2018-01-01, stored as integer milliseconds since the Unix epoch.
# A Timedelta step avoids the deprecated "H" alias and stays exact even when
# FMT does not divide 24.
freq = pd.Timedelta(days=1) / FMT
dti = pd.date_range("2018-01-01", periods=N, freq=freq)
# Floor-divide by one millisecond so the values stay 64-bit integers the whole
# way: no float round trip and no platform-dependent `int` width.
df["timestamp_ms"] = (dti - pd.Timestamp("1970-01-01")) // pd.Timedelta(milliseconds=1)

In [7]:
# Display the dataset with its regenerated timestamps (pandas truncates the rendering of long frames).
df

Unnamed: 0,timestamp_ms,longitude,latitude,text,name,admin1,cc
0,1514764800000,-66.578926,6.422820,parmesan best italian food ingredient food lif...,Puerto Carreno,Vichada,CO
1,1514786400000,-66.578926,6.422820,parmesan best italian food ingredient food lif...,Puerto Carreno,Vichada,CO
2,1514808000000,-122.228685,37.791994,football life kick ball goal goal win win win ...,Alameda,California,US
3,1514829600000,-122.228685,37.791994,football life kick ball goal goal win win win ...,Alameda,California,US
4,1514851200000,21.060741,52.232836,parmesan best italian food ingredient food lif...,Praga Poludnie,Masovian Voivodeship,PL
...,...,...,...,...,...,...,...
995,1536256800000,-79.272569,43.629311,tourtoronto tweeryourseat kidding sec row seats,Scarborough,Ontario,CA
996,1536278400000,-60.029848,-37.147576,s oh quiet emabiggestfansjustinbieber,Olavarria,Buenos Aires,AR
997,1536300000000,-43.441578,-22.911422,cem rts cem vote pelo justin iwannahearwdymons...,Nilopolis,Rio de Janeiro,BR
998,1536321600000,-60.029848,-37.147576,u emabiggestfansjustinbieber,Olavarria,Buenos Aires,AR


In [8]:
# Write every record as seven consecutive lines, one field per line:
# integer timestamp, two coordinates with 8 decimals, then four text fields.
# NOTE: numpy documents that `delimiter` is ignored when `fmt` is a single
# multi-format string, so the '\r\n' below has no effect on the output.
fmt = '\n'.join(['%d', '%.8f', '%.8f', '%s', '%s', '%s', '%s'])
np.savetxt(
    r'datasetSynth1000.txt',
    df.values,
    fmt=fmt,
    delimiter='\r\n',
)

In [213]:
# Inspect the template row used for topic 1 at its first location.
row1loc1

timestamp_ms                                        1443887442007
longitude                                                -66.5789
latitude                                                  6.42282
text            parmesan best italian food ingredient food lif...
name                                               Puerto Carreno
admin1                                                    Vichada
cc                                                             CO
Name: 0, dtype: object

In [214]:
# Inspect the template row used for topic 1 at its second location.
row1loc2

timestamp_ms                                        1443888128338
longitude                                                 21.0607
latitude                                                  52.2328
text            parmesan best italian food ingredient food lif...
name                                               Praga Poludnie
admin1                                       Masovian Voivodeship
cc                                                             PL
Name: 4, dtype: object

In [215]:
# Inspect the template row used for topic 2 at its first location.
row2loc1

timestamp_ms                                        1443889322941
longitude                                                  121.45
latitude                                                  14.2528
text            football life kick ball goal goal win win win ...
name                                                   Cabanbanan
admin1                                                 Calabarzon
cc                                                             PH
Name: 18, dtype: object

In [216]:
# Inspect the template row used for topic 2 at its second location.
row2loc2

timestamp_ms                                        1443889758843
longitude                                               -0.350693
latitude                                                  51.4617
text            football life kick ball goal goal win win win ...
name                                                     Hounslow
admin1                                                    England
cc                                                             GB
Name: 26, dtype: object

In [217]:
# Inspect the template row used for topic 2 at its third location.
row2loc3

timestamp_ms                                        1443890179277
longitude                                                  120.97
latitude                                                  14.4305
text            football life kick ball goal goal win win win ...
name                                                    Las Pinas
admin1                                                 Calabarzon
cc                                                             PH
Name: 30, dtype: object

In [218]:
# Inspect the template row used for topic 2 at its fourth location.
row2loc4

timestamp_ms                                        1443890918801
longitude                                                 55.6954
latitude                                                  25.3163
text            football life kick ball goal goal win win win ...
name                                                    Adh Dhayd
admin1                                               Ash Shariqah
cc                                                             AE
Name: 46, dtype: object