In [27]:
import pandas as pd
import numpy  as np

# Load the raw Google Play Store table; every later step works on this frame.
df = pd.read_csv('googleplaystore.csv')

# fill in Types since it can be implied from Price
# `a` flags the rows whose Type is missing. The fill is done with one .loc
# assignment: the previous `df.Type[i] = ...` form is chained indexing, which
# triggers SettingWithCopyWarning and is not guaranteed to write into `df`.
a = df.Type.isna()
# Price is still text at this point, so a free app is the literal string "0".
df.loc[a, "Type"] = np.where(df.loc[a, "Price"] == "0", "Free", "Paid")
            
# treat missing values in Content Rating
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the `df["Content Rating"]` selection is chained assignment and can leave
# `df` unchanged when pandas Copy-on-Write is active.
df["Content Rating"] = df["Content Rating"].fillna("missing")

# reserve missing datapoints in Rating as test set
# Explicit copy so df_test is independent of `df`, which is filtered right below.
df_test = df[df["Rating"].isna()].copy()

# drop missing values
# Removes every row that still has a missing value in any column (this includes
# the rows just copied into df_test) and renumbers the remaining rows from 0.
df = df.dropna().reset_index(drop=True)

# convert the type of Reviews from object to int
df['Reviews'] = df['Reviews'].astype(int)

# preprocess size
# Record which rows carry the 'Varies with device' placeholder before it is
# overwritten with '0' below.
df['Size varies'] = df['Size'].map(lambda size: int('Varies' in size))

# Turn the size strings into numbers: the placeholder becomes '0', and the
# 'k' / 'M' suffixes become the exponents e+3 / e+6 so the text parses as a float.
size_text = df['Size'].replace('Varies with device', '0')
for suffix, exponent in (('k', 'e+3'), ('M', 'e+6')):
    size_text = size_text.str.replace(suffix, exponent)

# Divide by one million so a value written as 'NM' ends up as N.
df['Size'] = pd.to_numeric(size_text) / 1000000

# preprocess price
# Remove '$' characters from both ends of each price string, then parse the
# remaining text as a number.
price_text = df['Price'].map(lambda price: price.strip('$'))
df['Price'] = pd.to_numeric(price_text)

# preprocess genres
# Genres is a ';'-separated string. Keep the first and the last piece; when the
# string contains no ';' both pieces are the same label.
genre_parts = df['Genres'].map(lambda genres: genres.split(';'))
split_genres1 = genre_parts.map(lambda parts: parts[0])
split_genres2 = genre_parts.map(lambda parts: parts[-1])
df["First Genre"] = split_genres1
df["Second Genre"] = split_genres2

# preprocess last updated
from datetime import datetime,date
update_date=pd.to_datetime(df["Last Updated"])
# Number of whole days between the last update and today, stored as float.
# The subtraction is done on the whole column and the day count is read with
# `.dt.days`; this replaces the per-row apply, the cast of a timedelta column to
# a platform-sized `int`, and the division by the nanoseconds-per-day constant.
# NOTE: the value is relative to date.today(), so it changes with the run date.
df["Updated Days"] = (pd.Timestamp(date.today()) - update_date).dt.days.astype(float)

# preprocess current ver
import re

# Raw string, because \. \d and \W are not valid escapes in a normal literal.
# The last alternative (\W*) can match the empty string, so findall() always
# returns at least one item and item 0 is the match at the start of the text:
# "digit.digit", else a single digit, else a (possibly empty) run of non-word
# characters.
_current_ver_head = re.compile(r'^[0-9]\.[0-9]|[\d]|\W*')

# Keep the 'Varies with device' placeholder for now so it can be flagged below;
# every other value is cut down to the leading piece described above.
df["Current Ver"] = df["Current Ver"].apply(
    lambda x: x if x == 'Varies with device' else _current_ver_head.findall(str(x))[0])
df['Current ver varies'] = df['Current Ver'].apply(lambda x: 1 if 'Varies' in x else 0)

# Placeholder rows and rows where nothing numeric was found both become "0".
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment and may not update `df`.
df["Current Ver"] = df["Current Ver"].replace(["Varies with device", ""], "0")
df["Current Ver"] = pd.to_numeric(df["Current Ver"])
uniq_ver = df["Current Ver"].unique()
ord_ver = sorted(uniq_ver)

# preprocess android ver
df['Android ver varies'] = df['Android Ver'].apply(lambda x: 1 if 'Varies' in x else 0)


def _short_version(text):
    """Return the leading piece of a version string.

    The pattern's last alternative can match the empty string, so findall()
    always yields at least one item; item 0 is the match at the start of the
    text: "digit.digit", else a single digit, else a (possibly empty) run of
    non-word characters.
    """
    # Raw string: \. \d and \W are not valid escapes in a normal literal.
    return re.findall(r'^[0-9]\.[0-9]|[\d]|\W*', str(text))[0]


# "A and up" and "A - B" both give A as the minimum; the maximum is the last
# piece, i.e. "up" or B.
df["Min Ver"]=df["Android Ver"].apply(lambda x:str(x).split(' and ')[0].split(' - ')[0])
df["Min Ver"]=df["Min Ver"].replace('4.4W','4.4')
df["Max Ver"]=df["Android Ver"].apply(lambda x:str(x).split(' and ')[-1].split(' - ')[-1])

# Results are assigned back rather than replaced in place on a column selection
# (chained assignment, which may not update `df`).
df["Min Ver"] = df["Min Ver"].replace("Varies with device", "0")
df["Max Ver"] = df["Max Ver"].replace("Varies with device", "0")

# The placeholder is already "0" here, so no special case is needed any more.
df["Min Ver"] = df["Min Ver"].apply(_short_version)
# An open-ended range ("... and up") is capped at '9.0'. The conditional is no
# longer wrapped in (...)[0], which used to index into the string '9.0' itself
# and produced '9'.
df["Max Ver"] = df["Max Ver"].apply(lambda x: '9.0' if 'up' in x else _short_version(x))
a1 = df["Min Ver"].unique()
ord_min = sorted(a1)
a2 = df["Max Ver"].unique()
ord_max = sorted(a2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [28]:
# Separate the target from the features. pop() returns the Rating column and
# removes it from `df` in place, so this cell can only run once per freshly
# preprocessed `df`.
y = df.pop('Rating')
# X is the same object as df (now without Rating), not a copy.
X = df

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

def ML_pipeline_Ridge(X,y,random_state):
    """Tune and score a Ridge regression with a held-out test set and 5-fold CV.

    20% of the rows are held out as a test set. The remaining rows are split
    into 5 shuffled folds. In each fold the preprocessors are fitted on the
    training part only, one Ridge model is fitted per alpha in
    ``np.logspace(-5, 3, num=20)``, the model with the highest validation score
    is kept, and that model is scored on the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain the columns named in ``con_ftrs``,
        ``onehot_ftrs`` and ``ord_ftrs`` below.
    y : pandas.Series
        Target values, row-aligned with ``X``.
    random_state : int
        Seed passed to both ``train_test_split`` and ``KFold``.

    Returns
    -------
    best_alpha : float
        The alpha chosen in the fold whose selected model scored highest on the
        test set.
    test_scores : list of float
        One test-set score (as returned by ``Ridge.score``) per fold.
    """
    # Column roles and the alpha grid are the same for every fold, so they are
    # built once instead of inside the loop.
    con_ftrs = ['Reviews','Size','Price','Updated Days']
    onehot_ftrs = ['Category','Size varies','Type','Content Rating','First Genre','Second Genre',
                   'Current Ver','Current ver varies','Min Ver','Max Ver','Android ver varies']
    ord_ftrs = ['Installs']
    # Install buckets listed from smallest to largest; the ordinal codes follow this order.
    ord_cats = [['1+','5+','10+','50+','100+','500+','1,000+','5,000+','10,000+','50,000+','100,000+','500,000+',
                 '1,000,000+','5,000,000+','10,000,000+','50,000,000+','100,000,000+','500,000,000+','1,000,000,000+']]
    alpha = np.logspace(-5,3,num=20)

    def _encode(frame, scaler, ohe, orde, onehot_names):
        """Apply the already-fitted preprocessors to `frame` and join the parts column-wise."""
        parts = [
            pd.DataFrame(data=scaler.transform(frame[con_ftrs]), columns=con_ftrs),
            pd.DataFrame(data=ohe.transform(frame[onehot_ftrs]), columns=onehot_names),
            pd.DataFrame(data=orde.transform(frame[ord_ftrs]), columns=ord_ftrs),
        ]
        # Every part has a fresh 0..n-1 index, so the column-wise concat lines up row by row.
        return pd.concat(parts, axis=1)

    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
    test_scores = []
    alpha_opt = []
    kf = KFold(n_splits=5,shuffle=True,random_state=random_state)

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other.iloc[train_index], X_other.iloc[CV_index]
        y_train, y_CV = y_other.iloc[train_index], y_other.iloc[CV_index]

        # Newer scikit-learn renamed `sparse` to `sparse_output`; fall back to
        # the old keyword when the installed version does not know the new one.
        try:
            ohe = OneHotEncoder(sparse_output=False, categories='auto', handle_unknown='ignore')
        except TypeError:
            ohe = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore')
        orde = OrdinalEncoder(categories = ord_cats)
        scaler = StandardScaler()

        # Fit the preprocessors on the training part of this fold only.
        scaler.fit(X_train[con_ftrs])
        ohe.fit(X_train[onehot_ftrs])
        orde.fit(X_train[ord_ftrs])
        # get_feature_names() was replaced by get_feature_names_out(); use
        # whichever the installed version provides. The names only label the
        # intermediate frames and are the same for train, CV and test.
        if hasattr(ohe, 'get_feature_names_out'):
            onehot_names = ohe.get_feature_names_out()
        else:
            onehot_names = ohe.get_feature_names()

        X_train_df = _encode(X_train, scaler, ohe, orde, onehot_names)
        X_c_df = _encode(X_CV, scaler, ohe, orde, onehot_names)
        X_t_df = _encode(X_test, scaler, ohe, orde, onehot_names)

        # tune the ridge hyper-parameter, alpha
        CV_score = []
        regs = []
        for a in alpha:
            reg = Ridge(alpha = a)
            reg.fit(X_train_df,y_train)
            CV_score.append(reg.score(X_c_df, y_CV))
            regs.append(reg)

        # position of the best validation score in this fold
        best = int(np.argmax(CV_score))
        alpha_opt.append(alpha[best])
        # calculate test score using the best model of this fold
        test_scores.append(regs[best].score(X_t_df, y_test))

    # NOTE(review): the reported alpha is picked by test-set score across folds,
    # so the test set takes part in this final choice.
    best_alpha = alpha_opt[np.argmax(test_scores)]
    return best_alpha, test_scores

In [16]:
# Single run with seed 42; the cell displays the returned (best_alpha, test_scores) pair.
ML_pipeline_Ridge(X, y, random_state=42)

(54.555947811685144,
 [0.057015784601311965,
  0.06349610449222631,
  0.05043564963310964,
  0.05470348104103173,
  0.06324519977035159])

In [35]:
# Repeat the Ridge pipeline for ten seeds (0, 42, ..., 378) and collect the
# per-fold test scores of every run.
test_scores = []

for i in range(10):
    seed = i*42
    best_alpha, test_score = ML_pipeline_Ridge(X, y, seed)
    test_scores.append(test_score)
    print('random_state:', seed, 'best_alpha:', best_alpha)

# Mean and standard deviation are taken over all collected scores (every fold of every run).
score_mean = np.around(np.mean(test_scores),3)
score_std = np.around(np.std(test_scores),3)
print('test score:',score_mean,'+/-',score_std)

random_state: 0 best_alpha: 7.847599703514606
random_state: 42 best_alpha: 54.555947811685144
random_state: 84 best_alpha: 2.976351441631313
random_state: 126 best_alpha: 20.6913808111479
random_state: 168 best_alpha: 20.6913808111479
random_state: 210 best_alpha: 54.555947811685144
random_state: 252 best_alpha: 20.6913808111479
random_state: 294 best_alpha: 20.6913808111479
random_state: 336 best_alpha: 143.844988828766
random_state: 378 best_alpha: 54.555947811685144
test score: 0.065 +/- 0.012


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

def ML_pipeline_RF(X,y,random_state):
    """Tune and score a random-forest regressor with a held-out test set and 5-fold CV.

    20% of the rows are held out as a test set. The remaining rows are split
    into 5 shuffled folds. In each fold the preprocessors are fitted on the
    training part only, one 100-tree forest is fitted for every combination of
    ``max_depth`` in 1..9 and ``max_features`` in ('auto', 'sqrt', 'log2'), the
    forest with the highest validation score is kept, and that forest is scored
    on the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain the columns named in ``con_ftrs``,
        ``onehot_ftrs`` and ``ord_ftrs`` below.
    y : pandas.Series
        Target values, row-aligned with ``X``.
    random_state : int
        Seed passed to ``train_test_split``, ``KFold`` and every forest.

    Returns
    -------
    best_para : tuple
        ``(max_depth, max_features)`` chosen in the fold whose selected forest
        scored highest on the test set.
    test_scores : list of float
        One test-set score (as returned by ``RandomForestRegressor.score``) per fold.
    """
    # Column roles and the parameter grid are the same for every fold, so they
    # are built once instead of inside the loop.
    con_ftrs = ['Reviews','Size','Price','Updated Days']
    onehot_ftrs = ['Category','Size varies','Type','Content Rating','First Genre','Second Genre',
                   'Current Ver','Current ver varies','Min Ver','Max Ver','Android ver varies']
    ord_ftrs = ['Installs']
    # Install buckets listed from smallest to largest; the ordinal codes follow this order.
    ord_cats = [['1+','5+','10+','50+','100+','500+','1,000+','5,000+','10,000+','50,000+','100,000+','500,000+',
                 '1,000,000+','5,000,000+','10,000,000+','50,000,000+','100,000,000+','500,000,000+','1,000,000,000+']]
    depth = list(range(1,10))
    features = ['auto','sqrt','log2']
    dep_fea = [(d, f) for d in depth for f in features]

    def _encode(frame, scaler, ohe, orde, onehot_names):
        """Apply the already-fitted preprocessors to `frame` and join the parts column-wise."""
        parts = [
            pd.DataFrame(data=scaler.transform(frame[con_ftrs]), columns=con_ftrs),
            pd.DataFrame(data=ohe.transform(frame[onehot_ftrs]), columns=onehot_names),
            pd.DataFrame(data=orde.transform(frame[ord_ftrs]), columns=ord_ftrs),
        ]
        # Every part has a fresh 0..n-1 index, so the column-wise concat lines up row by row.
        return pd.concat(parts, axis=1)

    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state)
    test_scores = []
    bestpara_opt = []
    kf = KFold(n_splits=5,shuffle=True,random_state=random_state)

    for train_index, CV_index in kf.split(X_other,y_other):
        X_train, X_CV = X_other.iloc[train_index], X_other.iloc[CV_index]
        y_train, y_CV = y_other.iloc[train_index], y_other.iloc[CV_index]

        # Newer scikit-learn renamed `sparse` to `sparse_output`; fall back to
        # the old keyword when the installed version does not know the new one.
        try:
            ohe = OneHotEncoder(sparse_output=False, categories='auto', handle_unknown='ignore')
        except TypeError:
            ohe = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore')
        orde = OrdinalEncoder(categories = ord_cats)
        scaler = StandardScaler()

        # Fit the preprocessors on the training part of this fold only.
        scaler.fit(X_train[con_ftrs])
        ohe.fit(X_train[onehot_ftrs])
        orde.fit(X_train[ord_ftrs])
        # get_feature_names() was replaced by get_feature_names_out(); use
        # whichever the installed version provides. The names only label the
        # intermediate frames and are the same for train, CV and test.
        if hasattr(ohe, 'get_feature_names_out'):
            onehot_names = ohe.get_feature_names_out()
        else:
            onehot_names = ohe.get_feature_names()

        X_train_df = _encode(X_train, scaler, ohe, orde, onehot_names)
        X_c_df = _encode(X_CV, scaler, ohe, orde, onehot_names)
        X_t_df = _encode(X_test, scaler, ohe, orde, onehot_names)

        # tune hyper-parameters (max_depth, max_features)
        CV_score_rf = []
        rfs = []
        for d, f in dep_fea:
            # For a regressor 'auto' meant "consider all features", which is
            # what None requests. Newer scikit-learn rejects the string
            # 'auto', so None is passed while the reported label stays 'auto'.
            forest = RandomForestRegressor(n_estimators=100,max_depth=d,
                                           max_features=None if f == 'auto' else f,
                                           random_state=random_state)
            forest.fit(X_train_df,y_train)
            CV_score_rf.append(forest.score(X_c_df,y_CV))
            rfs.append(forest)

        # position of the best validation score in this fold
        best = int(np.argmax(CV_score_rf))
        bestpara_opt.append(dep_fea[best])
        # calculate test score using the best model of this fold
        test_scores.append(rfs[best].score(X_t_df, y_test))

    # NOTE(review): the reported parameters are picked by test-set score across
    # folds, so the test set takes part in this final choice.
    best_para = bestpara_opt[np.argmax(test_scores)]
    return best_para, test_scores

In [26]:
# Single run with seed 42; the cell displays the returned (best_para, test_scores) pair.
ML_pipeline_RF(X, y, random_state=42)

((9, 'auto'),
 [0.1493370465357815,
  0.1353549857102454,
  0.14961132970489732,
  0.1446092093317335,
  0.12777972564636275])