In [1]:
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Read in the data

In [2]:
# The three source exports are read with a tab separator (sep='\t').
source_files = ('avalik_1.csv', 'avalik_2.csv', 'avalik_3.csv')
df_1, df_2, df_3 = [pd.read_csv(path, sep='\t') for path in source_files]

In [3]:
# Stack the three exports into a single frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent. Like the chained append it keeps
# each frame's original row labels (the index is reset further down).
df = pd.concat([df_1, df_2, df_3])
# Discard rows that are identical across every column.
df = df.drop_duplicates()

In [4]:
# Convert the ToimKpv column to pandas datetimes (it is renamed to 'date' further down).
df = df.assign(ToimKpv=pd.to_datetime(df['ToimKpv']))
# Disabled filter: keep only rows dated 2019 or later.
# df = df.loc[df['ToimKpv']>='2019']

# Clean the data

In [5]:
# Keep only Tallinn rows: ValdLinnNimetus must contain 'Tallinn' and
# KohtNimetus must contain 'linnaosa'. Missing values count as non-matches.
mentions_tallinn = df['ValdLinnNimetus'].str.contains('Tallinn', regex=True, na=False)
mentions_district = df['KohtNimetus'].str.contains('linnaosa', regex=True, na=False)
df = df.loc[mentions_tallinn & mentions_district]

In [6]:
# Target flag: 1 when SyndmusTaiendavStatLiik contains 'MOBIIL', otherwise 0
# (rows with a missing value get 0).
mentions_mobile = df['SyndmusTaiendavStatLiik'].str.contains('MOBIIL', na=False)
df['is_mobile'] = mentions_mobile.astype(int)
# Disabled: an equivalent case-insensitive flag for 'jalgrat'.
# df['is_bike'] = np.where(df['SyndmusTaiendavStatLiik'].str.contains('jalgrat', flags=re.IGNORECASE, regex=True, na=False), 1, 0)
# Disabled: drop rows where both the bike and the mobile flag are 1.
# df = df.loc[~((df['is_mobile']==1)&(df['is_bike']==1))]

In [7]:
# Source column -> English name. Only these columns plus the target flag are kept.
column_names = {'ToimKpv':'date', 'ToimKell':'time', 'ToimNadalapaev':'weekday',
                'KohtLiik':'place', 'KohtNimetus':'district'}

df = df[list(column_names) + ['is_mobile']].rename(columns=column_names)

# Trim leading/trailing whitespace from the categorical text columns.
text_columns = ['weekday', 'place', 'district']
df = df.assign(**{name: df[name].str.strip() for name in text_columns})

# Discard rows that have a missing value in any of the remaining columns.
df = df.dropna()

In [8]:
# Generic place labels that are stripped when they appear next to another
# label in a comma-separated list.
GENERIC_PLACES = ['AVALIK_KOHT', 'TANAV_VALJAK', 'MUU KOHT', 'MUU RUUM']
# How many of the most frequent place values are kept for modelling.
TOP_PLACES_COUNT = 8

# Remove too-generic place values. Only occurrences adjacent to a comma are
# removed, so a value consisting of a single generic label is left untouched.
# regex=False: the labels are literal text, and the default of `regex`
# differs between pandas versions.
for value in GENERIC_PLACES:
    df['place'] = df['place'].str.replace(value+',', '', regex=False)
    df['place'] = df['place'].str.replace(','+value, '', regex=False)

# If there are multiple places, keep the first one.
# The pattern is a raw string so that `\w` is not parsed as an (invalid)
# string escape sequence.
# NOTE(review): `\w+` does not match spaces or hyphens, so a comma-separated
# value whose first entry contains such characters becomes NaN here and is
# then excluded by the top-places filter below - confirm this is intended.
df['place'] = np.where(df['place'].str.contains(',', regex=False),
                       df['place'].str.extract(r'^(\w+),')[0],
                       df['place'])

# Only keep rows whose place is among the most frequent values.
top_places = df['place'].value_counts().head(TOP_PLACES_COUNT).index
df = df.loc[df['place'].isin(top_places)]

In [9]:
# Disabled alternative: collapse `district` into a two-level feature,
# 'centre' for 'Kesklinna linnaosa' and 'not_centre' for everything else.
# df['district'] = np.where(df['district']=='Kesklinna linnaosa','centre','not_centre')

In [10]:
# Translate weekday names to three-letter English abbreviations.
# Values that are not in the mapping become NaN.
source_days = ['Esmaspäev', 'Teisipäev', 'Kolmapäev', 'Neljapäev', 'Reede', 'Laupäev', 'Pühapäev']
short_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df['weekday'] = df['weekday'].map(dict(zip(source_days, short_days)))
# Disabled alternative: two-level weekday / weekend encoding.
# df['weekday'] = df['weekday'].map({'Esmaspäev':'weekday', 'Teisipäev':'weekday',
#                                    'Kolmapäev':'weekday', 'Neljapäev':'weekday', 
#                                    'Reede':'weekday', 'Laupäev':'weekend', 'Pühapäev':'weekend'})

In [11]:
# Season is looked up from the English month name of `date`.
months_by_season = {
    'winter': ['December', 'January', 'February'],
    'spring': ['March', 'April', 'May'],
    'summer': ['June', 'July', 'August'],
    'autumn': ['September', 'October', 'November'],
}
season_by_month = {month: season
                   for season, months in months_by_season.items()
                   for month in months}
df['season'] = pd.to_datetime(df['date']).dt.month_name().map(season_by_month)

# Hours 6 through 17 (both inclusive) are labelled 'day', every other hour 'night'.
event_hour = pd.to_datetime(df['time'], format='%H:%M').dt.hour
df['time_of_day'] = np.where(event_hour.between(6, 17), 'day', 'night')

# The raw clock time is no longer needed once time_of_day exists.
df = df.drop(columns=['time'])

In [12]:
# Replace the row labels left over from concatenation and filtering with a
# fresh 0..n-1 range; drop=True discards the old labels instead of keeping
# them as a column.
df = df.reset_index(drop=True)

In [13]:
# Row count of the cleaned frame, then a preview of its first rows.
print(df.shape[0])
df.head()

7458


Unnamed: 0,date,weekday,place,district,is_mobile,season,time_of_day
0,2020-09-24,Thu,KAUPLUS,Lasnamäe linnaosa,0,autumn,night
1,2020-09-23,Wed,KAUPLUS,Haabersti linnaosa,0,autumn,night
2,2020-09-23,Wed,KAUPLUS,Mustamäe linnaosa,0,autumn,night
3,2020-09-23,Wed,SOOGIKOHT,Põhja-Tallinna linnaosa,1,autumn,night
4,2020-09-23,Wed,TANAV_VALJAK,Kesklinna linnaosa,0,autumn,day


# Regression 2019-2020

In [None]:
# Subset of rows whose `date` is on or after the start of 2019.
# NOTE(review): this cell has no execution count and `df_2019` is not used by
# the modelling cells visible below, which work on the full `df` - confirm
# whether the regression was meant to run on this subset.
df_2019 = df.loc[df['date']>='2019']

In [14]:
# One-hot encode the categorical text columns; drop_first=True drops the first
# level of each variable so the dummies are not collinear with the intercept.
# dtype=int pins numeric 0/1 columns: pandas >= 2.0 returns bool dummies by
# default, which turn into an object array once the float constant column is
# added and are then rejected by statsmodels.
df_with_dummies = pd.get_dummies(df, drop_first=True, dtype=int)

In [15]:
# Predictors: every encoded column except the target and the raw date.
features = df_with_dummies.columns.tolist()
for excluded in ('is_mobile', 'date'):
    features.remove(excluded)

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
df_train, df_test = train_test_split(df_with_dummies, **split_options)

In [17]:
# Design matrix: all candidate features plus an explicit intercept column.
x1 = df_train.loc[:, features]
x = sm.add_constant(x1)
# Binary target.
y = df_train.loc[:, 'is_mobile']

# Fit the full logistic regression by maximum likelihood.
logit_model = sm.Logit(endog=y, exog=x)
regression = logit_model.fit()

regression.summary()

Optimization terminated successfully.
         Current function value: 0.106940
         Iterations 10


0,1,2,3
Dep. Variable:,is_mobile,No. Observations:,5966.0
Model:,Logit,Df Residuals:,5941.0
Method:,MLE,Df Model:,24.0
Date:,"Tue, 29 Sep 2020",Pseudo R-squ.:,0.277
Time:,23:50:02,Log-Likelihood:,-638.01
converged:,True,LL-Null:,-882.43
Covariance Type:,nonrobust,LLR p-value:,3.4549999999999995e-88

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.6447,0.562,-4.702,0.000,-3.747,-1.542
weekday_Mon,-0.1915,0.325,-0.590,0.555,-0.828,0.445
weekday_Sat,0.5948,0.251,2.369,0.018,0.103,1.087
weekday_Sun,0.2900,0.265,1.095,0.274,-0.229,0.809
weekday_Thu,-0.4333,0.334,-1.297,0.195,-1.088,0.222
weekday_Tue,-0.5920,0.349,-1.697,0.090,-1.276,0.092
weekday_Wed,-0.2320,0.320,-0.726,0.468,-0.859,0.395
place_KAUBAMAJA,-1.9286,0.797,-2.421,0.015,-3.490,-0.367
place_KAUPLUS,-2.3934,0.403,-5.941,0.000,-3.183,-1.604


### Removing insignificant features

In [18]:
# Backward elimination on p-values: repeatedly refit the model using only the
# features that are significant in the current fit, until no insignificant
# feature is left. The intercept ('const') is never treated as a feature.
#
# The stop condition is evaluated on the *current* model before refitting, so
# the loop does not perform a redundant identical fit once every remaining
# feature is already significant.
ALPHA = 0.05  # significance threshold applied to the p-values

while True:
    data = regression.pvalues

    new_features = list(data.loc[data<ALPHA].index)
    if 'const' in new_features: new_features.remove('const')

    to_remove = list(data.loc[data>=ALPHA].index)
    if 'const' in to_remove: to_remove.remove('const')

    # Nothing left to drop: `regression` is already the reduced model.
    if not to_remove:
        break

    y = df_train['is_mobile']
    x1 = df_train[new_features]

    x = sm.add_constant(x1)
    regression = sm.Logit(y,x).fit()

Optimization terminated successfully.
         Current function value: 0.109471
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.109471
         Iterations 10


In [19]:
# Refit on the features that survived the elimination step and show the summary.
x1 = df_train.loc[:, new_features]
x = sm.add_constant(x1)
y = df_train.loc[:, 'is_mobile']

reduced_model = sm.Logit(endog=y, exog=x)
regression = reduced_model.fit()

regression.summary()

Optimization terminated successfully.
         Current function value: 0.109471
         Iterations 10


0,1,2,3
Dep. Variable:,is_mobile,No. Observations:,5966.0
Model:,Logit,Df Residuals:,5958.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 29 Sep 2020",Pseudo R-squ.:,0.2599
Time:,23:50:02,Log-Likelihood:,-653.1
converged:,True,LL-Null:,-882.43
Covariance Type:,nonrobust,LLR p-value:,6.187e-95

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.2464,0.143,-22.628,0.000,-3.528,-2.965
weekday_Sat,0.6835,0.174,3.938,0.000,0.343,1.024
place_KAUBAMAJA,-1.6990,0.720,-2.358,0.018,-3.111,-0.287
place_KAUPLUS,-2.2795,0.252,-9.060,0.000,-2.773,-1.786
place_SOOGIKOHT,1.9210,0.181,10.589,0.000,1.565,2.277
place_TANKLA,-2.6135,1.009,-2.591,0.010,-4.591,-0.636
place_UHISSOIDUK,1.7555,0.274,6.413,0.000,1.219,2.292
season_summer,0.6012,0.157,3.838,0.000,0.294,0.908


In [20]:
# Copy of the reduced model's coefficient table from the summary above, kept for reference.
# Columns: term, coef, std err, z, P>|z|, [0.025, 0.975]
# const	-3.2464	0.143	-22.628	0.000	-3.528	-2.965
# weekday_Sat	0.6835	0.174	3.938	0.000	0.343	1.024
# place_KAUBAMAJA	-1.6990	0.720	-2.358	0.018	-3.111	-0.287
# place_KAUPLUS	-2.2795	0.252	-9.060	0.000	-2.773	-1.786
# place_SOOGIKOHT	1.9210	0.181	10.589	0.000	1.565	2.277
# place_TANKLA	-2.6135	1.009	-2.591	0.010	-4.591	-0.636
# place_UHISSOIDUK	1.7555	0.274	6.413	0.000	1.219	2.292
# season_summer	0.6012	0.157	3.838	0.000	0.294	0.908

In [21]:
# lesson 229