# Heckman model

In [48]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [26]:
google_last_df = pd.read_csv("../Data/cleaned_withNA.csv",encoding='latin_1')

#Run the line below if you start with the full data set
google_df = google_last_df.copy()

We create a feature indicating whether there is a NA present in nb_rating:

In [27]:
google_df['NA_rating'] = google_df.rating_app.isna()
google_df[['NA_rating', 'rating_app']].head()

Unnamed: 0,NA_rating,rating_app
0,False,3.0
1,False,4.3
2,False,4.3
3,True,
4,True,


We search for what might be a good feature to predict a NA in app rating:

In [28]:
#looking at the number of ratings
print('all app ratings: ')
print(google_df[['nb_rating']].describe())
print('')
print('number of app ratings with NA app rating: ')
print(google_df[google_df.rating_app.isna()][['nb_rating']].describe())
#Here we conclude that in all cases where app rating is missing, the number of ratings are also missing. It might be a good indicator to use for Heckman, but I'm unsure if it is allowed..

all app ratings: 
          nb_rating
count  7.756930e+05
mean   6.115331e+03
std    2.472945e+05
min    1.000000e+00
25%    8.000000e+00
50%    4.100000e+01
75%    2.770000e+02
max    8.858200e+07

number of app ratings with NA app rating: 
       nb_rating
count        0.0
mean         NaN
std          NaN
min          NaN
25%          NaN
50%          NaN
75%          NaN
max          NaN


In [29]:
#looking at number of downloads
print('number of app downloads with NA app rating: ')
print(google_df[['num_downloads_cat']].describe())
print('----------------------------------------------')
print('number of app downloads with NA app rating: ')
print(google_df[google_df.rating_app.isna()][['num_downloads_cat']].describe())
#Here we can see that number of downloads seem to be lower when there is a missing value in rating. We can investigate further.

number of app downloads with NA app rating: 
       num_downloads_cat
count             844233
unique                 6
top          1000 - 9999
freq              239492
----------------------------------------------
number of app downloads with NA app rating: 
       num_downloads_cat
count              68566
unique                 6
top               0 - 99
freq               45532


In [30]:
google_df[google_df.rating_app.isna()].num_downloads_cat.value_counts()

0 - 99             45532
100 - 999          21596
1000 - 9999         1302
10000 - 99999         98
100000 - 999999       33
1000000 +              5
Name: num_downloads_cat, dtype: int64

In [31]:
google_df.num_downloads_cat.value_counts()

1000 - 9999        239492
10000 - 99999      207768
100 - 999          171715
100000 - 999999    101084
0 - 99              91122
1000000 +           33052
Name: num_downloads_cat, dtype: int64

In [32]:
google_df.rating_app.loc[google_df.num_downloads_cat.isna()].isna().value_counts()

True     1211
False      27
Name: rating_app, dtype: int64

In [33]:
google_df.rating_app.loc[google_df.num_downloads_cat.isna()].value_counts()

5.0    19
3.0     2
4.2     1
3.7     1
4.1     1
3.6     1
3.9     1
1.5     1
Name: rating_app, dtype: int64

Here we see that the change in distribution could be significantly different, and it might be usable as a predictor of NA rating. But that there is also a small problem, there are a few NA's found in the number of downloads, where most do not have a rating as well.

To use the number of downloads we need to encode it as a one-hot encoded variable in order to be able to use it in the model. It is quite diffictult to give good names to the one-hot encoded variables as they are currently coded as numbers. We will use letters to indicate the categories:
a = NA 
b =  0 - 99
c = 100 - 999
d = 1000 - 9999
e = 10000 - 99999
f = 100000 - 999999
g = 1000000 +

In [34]:
google_df["num_downloads_cat"] = google_df["num_downloads_cat"].astype('category')
google_df["downloads_label"] = google_df["num_downloads_cat"].cat.codes
google_df["downloads_label"] = google_df.downloads_label.replace({-1 : 'a', 0 : 'b', 1 : 'c', 2 : 'd', 3 : 'e', 4 : 'f', 5: 'g'})
google_df["downloads_label"] = google_df.downloads_label.apply(lambda x: 'downloads_' + x)

google_df["downloads_numlabel"] = google_df["downloads_label"].astype('category').cat.codes

print(google_df[['num_downloads_cat', 'downloads_label', 'downloads_numlabel']].head(n=10))
print("")
print("---- NA's ----")
print(google_df[['num_downloads_cat', 'downloads_label', 'downloads_numlabel']].loc[google_df.num_downloads_cat.isna()].head(n=2))
print('--------------')

OH_downloads = pd.get_dummies(google_df['downloads_label'])
google_df = google_df.join(OH_downloads);

  num_downloads_cat downloads_label  downloads_numlabel
0         100 - 999     downloads_c                   2
1         100 - 999     downloads_c                   2
2         1000000 +     downloads_g                   6
3            0 - 99     downloads_b                   1
4            0 - 99     downloads_b                   1
5            0 - 99     downloads_b                   1
6            0 - 99     downloads_b                   1
7            0 - 99     downloads_b                   1
8         100 - 999     downloads_c                   2
9            0 - 99     downloads_b                   1

---- NA's ----
    num_downloads_cat downloads_label  downloads_numlabel
342               NaN     downloads_a                   0
687               NaN     downloads_a                   0
--------------


In [42]:
google_df = google_df.drop(['permissions'], axis=1)

In [49]:
def normalise_function(feature):
    x = google_df[[feature]].values.astype(float)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df_normalized = pd.DataFrame(x_scaled)
    google_df[feature + '_normalized'] = df_normalized

#sns.displot(google_df, x=google_df['nb_apps_developer'], bins =10)
normalise_function('nb_apps_developer')
normalise_function('age')
normalise_function('nb_rating')
normalise_function('nb_screenshots')
normalise_function('nb_permissions')

In [51]:
google_df.to_stata('../data/heckman.dta') 

In [50]:
[print(column) for column in google_df.columns];

my_app_id
rating_app
nb_rating
num_downloads
price_gplay
has_ads
in_app
num_downloads_cat
nb_screenshots
year_published
age
PEGI_12
PEGI_16
PEGI_18
PEGI_3
PEGI_7
Parenta
Unrated
nan_content
version
version_1_0
version_1_1
version_1_5
version_1_6
version_2_0
version_2_1
version_2_2
version_2_3
version_3_0
version_3_1
version_3_2
version_4_0
version_4_1
version_4_2
version_4_3
version_4_4
version_5_0
version_5_1
version_6_0
version_7_0
version_7_1
version_8_0
version_Var
version_nan_os
nb_permissions
nb_apps_developer
free_app
freeApp_x_hasAds
freeApp_x_inApp
freeApp_x_hasAds_inApp
Action
ActionANDAdventure
Adventure
Arcade
ArtANDDesign
AutoANDVehicles
Beauty
Board
BooksANDReference
BrainGames
Business
Card
Casino
Casual
Comics
Communication
Creativity
Dating
Education
Educational
Entertainment
Events
Finance
FoodANDDrink
HealthANDFitness
HouseANDHome
LibrariesANDDemo
Lifestyle
MapsANDNavigation
Medical
Music
MusicANDAudio
MusicANDVideo
NewsANDMagazines
Parenting
Personalization
Photogra

# Heckman analysis

In [45]:
google_df.downloads_numlabel

0         2
1         2
2         6
3         1
4         1
         ..
845466    3
845467    4
845468    4
845469    3
845470    4
Name: downloads_numlabel, Length: 845471, dtype: int8

In [91]:
from statsmodels.regression import heckman

model_vars = google_df[['free_app','in_app', 'has_ads',  'PEGI_12', 'PEGI_16', 'PEGI_18', 'PEGI_3', 'PEGI_7', 'Parenta', 'Unrated', 'version_1_0', 'version_1_1', 'version_1_5', 'version_1_6', 'version_2_0', 'version_2_1', 'version_2_2', 'version_2_3', 'version_3_0', 'version_3_1', 'version_3_2', 'version_4_0', 'version_4_1', 'version_4_2', 'version_4_3', 'version_4_4', 'version_5_0', 'version_5_1', 'version_6_0', 'version_7_0', 'version_7_1', 'version_8_0', 'version_Var', 'Action', 'ActionANDAdventure', 'Adventure', 'Arcade', 'AutoANDVehicles', 'Beauty', 'Board', 'BooksANDReference', 'BrainGames', 'Business', 'Card', 'Casino', 'Casual', 'Comics', 'Communication', 'Creativity', 'Dating', 'Education', 'Educational', 'Entertainment', 'Events', 'Finance', 'FoodANDDrink', 'HealthANDFitness', 'HouseANDHome', 'LibrariesANDDemo', 'Lifestyle', 'MapsANDNavigation', 'Medical', 'Music', 'MusicANDAudio', 'MusicANDVideo', 'NewsANDMagazines', 'Parenting', 'Personalization', 'Photography', 'PretendPlay', 'Productivity', 'Puzzle', 'Racing', 'RolePlaying', 'Shopping', 'Simulation', 'Social', 'Sports', 'Strategy', 'Tools', 'TravelANDLocal', 'Trivia', 'VideoPlayersANDEditors', 'Weather', 'Word']]
select_vars = google_df[['downloads_a' + 'downloads_b' + 'downloads_c' + 'downloads_d' + 'downloads_e' + 'downloads_f' + 'downloads_g']]

res = heckman.Heckman(rating_app, X, w_).fit(method='twostep')
print(res.summary())

#Blijkbaar werkt Heckman in python nog niet

ImportError: cannot import name 'heckman' from 'statsmodels.regression' (C:\Users\s159907\Anaconda3\lib\site-packages\statsmodels\regression\__init__.py)

In [17]:
google_df.to_csv('../Data/HeckmanR.csv',index=False)

The rest of the code is in the R notebook