In [1]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from sklearn.ensemble import RandomForestRegressor
import sklearn.preprocessing as preprocessing
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, cross_val_score, train_test_split

# 1. Data Pre-processing

### Fill NaN

In [2]:
train_raw_data = pd.read_csv("raw_data/train.csv",parse_dates=['purchase_date','release_date'])
train_raw_data['purchase_date'] = train_raw_data.purchase_date.fillna(method='backfill')
train_raw_data['total_positive_reviews'] = train_raw_data.total_positive_reviews.fillna(method='backfill')
train_raw_data['total_negative_reviews'] = train_raw_data.total_negative_reviews.fillna(method='backfill')

In [3]:
test_raw_data = pd.read_csv("raw_data/test.csv",parse_dates=['purchase_date','release_date'])
test_raw_data['purchase_date'] = test_raw_data.purchase_date.fillna(method='backfill')
test_raw_data['total_positive_reviews'] = test_raw_data.total_positive_reviews.fillna(method='backfill')
test_raw_data['total_negative_reviews'] = test_raw_data.total_negative_reviews.fillna(method='backfill')

# 2. Feature Engineering 

### 2.1 Extract date feature

In [4]:
# define date-extracting function
def extract_date(df_copy,df,column):
    df_copy[column+'_year']=df[column].apply(lambda x: x.year)
    df_copy[column+'_month']=df[column].apply(lambda x: x.month)
    df_copy[column+'_day']=df[column].apply(lambda x: x.day)
    return df_copy

In [5]:
# for training set
# extract purchase_date
train_raw_copy = train_raw_data.copy()
train_extract_purchase_date = extract_date(train_raw_copy, train_raw_data, 'purchase_date')

# extract release_date
train_extract_purchase_date_copy = train_extract_purchase_date.copy()
train_extract_date = extract_date(train_extract_purchase_date_copy, train_extract_purchase_date, 'release_date')

import datetime
train_extract_date['date_interval'] = (train_extract_date['purchase_date']-train_extract_date['release_date'])\
.apply(lambda x: x.days)

del train_raw_copy,train_extract_purchase_date,train_extract_purchase_date_copy

In [6]:
# for test set
# extract purchase_date
test_raw_copy = test_raw_data.copy()
test_extract_purchase_date = extract_date(test_raw_copy, test_raw_data, 'purchase_date')

# extract release_date
test_extract_purchase_date_copy = test_extract_purchase_date.copy()
test_extract_date = extract_date(test_extract_purchase_date_copy, test_extract_purchase_date, 'release_date')

import datetime
test_extract_date['date_interval'] = (test_extract_date['purchase_date']-test_extract_date['release_date'])\
.apply(lambda x: x.days)

del test_raw_copy,test_extract_purchase_date,test_extract_purchase_date_copy

### 2.2 Extract category feature

In [7]:
train_categories_one_hot = train_raw_data["categories"].str.get_dummies(",") 
test_categories_one_hot = test_raw_data["categories"].str.get_dummies(",") 
categories_train_diff_test = train_categories_one_hot.columns.difference(test_categories_one_hot.columns)
categories_test_diff_train = test_categories_one_hot.columns.difference(train_categories_one_hot.columns)
print(categories_train_diff_test)
print(categories_test_diff_train)

Index(['Valve Anti-Cheat enabled'], dtype='object')
Index([], dtype='object')


In [8]:
train_categories_one_hot = train_categories_one_hot.drop(columns=list(categories_train_diff_test),axis=1)

### 2.3 Extract genre feature

In [9]:
train_genres_one_hot = train_raw_data["genres"].str.get_dummies(",") 
test_genres_one_hot = test_raw_data["genres"].str.get_dummies(",") 
genres_train_diff_test = train_genres_one_hot.columns.difference(test_genres_one_hot.columns)
genres_test_diff_train = test_genres_one_hot.columns.difference(train_genres_one_hot.columns)
print(genres_train_diff_test)
print(genres_test_diff_train)

Index(['Animation & Modeling', 'Audio Production', 'Design & Illustration',
       'Racing', 'Sexual Content', 'Utilities'],
      dtype='object')
Index([], dtype='object')


In [10]:
train_genres_one_hot = train_genres_one_hot.drop(columns=list(genres_train_diff_test),axis=1)

### 2.4 Extract tag feature

In [11]:
train_tags_one_hot = train_raw_data["tags"].str.get_dummies(",") 
test_tags_one_hot = test_raw_data["tags"].str.get_dummies(",") 
tags_train_diff_test = train_tags_one_hot.columns.difference(test_tags_one_hot.columns)
tags_test_diff_train = test_tags_one_hot.columns.difference(train_tags_one_hot.columns)
print(tags_train_diff_test)
print(tags_test_diff_train)

Index(['3D', 'ATV', 'Addictive', 'Animation & Modeling',
       'Artificial Intelligence', 'Audio Production', 'Automation', 'Batman',
       'Battle Royale', 'Bikes', 'Board Game', 'Bullet Hell', 'Capitalism',
       'Card Game', 'Cartoon', 'Cats', 'Character Action Game', 'Chess',
       'Choose Your Own Adventure', 'Clicker', 'Co-op Campaign', 'Comic Book',
       'Conspiracy', 'Dark Comedy', 'Design & Illustration', 'Documentary',
       'Dungeons & Dragons', 'Experience', 'Flight', 'Game Development',
       'God Game', 'Gothic', 'Gun Customization', 'Hidden Object', 'Horses',
       'Immersive Sim', 'Intentionally Awkward Controls', 'Investigation',
       'LGBTQ+', 'Lara Croft', 'Logic', 'MMORPG', 'Mars', 'Martial Arts',
       'Metroidvania', 'Motocross', 'Motorbike', 'Movie', 'Multiple Endings',
       'Naval', 'Offroad', 'Perma Death', 'Pirates', 'Programming', 'PvE',
       'Quick-Time Events', 'Racing', 'Rome', 'Sailing', 'Satire',
       'Score Attack', 'Sequel', 'Sniper',

In [12]:
train_tags_one_hot = train_tags_one_hot.drop(columns=list(tags_train_diff_test),axis=1)
test_tags_one_hot = test_tags_one_hot.drop(columns=list(tags_test_diff_train),axis=1)

### 2.5 Feature Concat 

In [13]:
train = pd.concat([train_extract_date, train_categories_one_hot,train_genres_one_hot,train_tags_one_hot],axis=1)
test = pd.concat([test_extract_date, test_categories_one_hot,test_genres_one_hot,test_tags_one_hot],axis=1)
print('train shape',train.shape)
print('test shape',test.shape)

train shape (357, 284)
test shape (90, 283)


### 2.6 Process repeated columns between tags, categories and genres

In [14]:
# for training set
set_tag = set(train_tags_one_hot.columns)
set_genre = set(train_genres_one_hot.columns)
set_category = set(train_categories_one_hot.columns)

set_1 = set_tag & set_genre
set_2 = set_category & set_tag
set_3 = set_category & set_genre
set_all = set_1 | set_2 | set_3
print('repeated columns：',set_all)
for col in set_all:
    tmp = train[[col]]
    tmp.columns=[col+'_1',col+'_2']
    tmp[col]=0
    tmp.loc[(tmp[col+'_1']==1) | (tmp[col+'_2']==1),[col]]=1
    train = train.drop([col],axis=1)
    tmp = tmp.drop([col+'_1',col+'_2'],axis=1)
    train = pd.concat([train,tmp],axis=1)
del set_1,set_2,set_3,set_all,tmp

repeated columns： {'RPG', 'Simulation', 'Sports', 'Early Access', 'Gore', 'Massively Multiplayer', 'Co-op', 'Action', 'Casual', 'Strategy', 'Free to Play', 'Nudity', 'Violent', 'Indie', 'Adventure'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [15]:
# for test set
set_tag = set(test_tags_one_hot.columns)
set_genre = set(test_genres_one_hot.columns)
set_category = set(test_categories_one_hot.columns)
set_1 = set_tag & set_genre
set_2 = set_category & set_tag
set_3 = set_category & set_genre
set_all = set_1 | set_2 | set_3
print('repeated columns：',set_all)
for col in set_all:
    tmp = test[[col]]
    tmp.columns=[col+'_1',col+'_2']
    tmp[col]=0
    tmp.loc[(tmp[col+'_1']==1) | (tmp[col+'_2']==1),[col]]=1
    test = test.drop([col],axis=1)
    tmp = tmp.drop([col+'_1',col+'_2'],axis=1)
    test = pd.concat([test,tmp],axis=1)
del set_1,set_2,set_3,set_all,tmp

repeated columns： {'RPG', 'Simulation', 'Sports', 'Early Access', 'Gore', 'Massively Multiplayer', 'Co-op', 'Action', 'Casual', 'Strategy', 'Free to Play', 'Nudity', 'Violent', 'Indie', 'Adventure'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
print('train shape',train.shape)
print('test shape',test.shape)

train shape (357, 269)
test shape (90, 268)


# 3. Regression：predict play time

### 3.1 1st Training (using all features)

In [17]:
train_input = train.drop(['categories','genres','tags','purchase_date','release_date','Free to Play'],axis=1)
train_x = train_input.drop(['playtime_forever','id'],axis=1)
train_y = train_input[['playtime_forever']]
print('train_x shape',train_x.shape)

train_x shape (357, 261)


In [18]:
####model####
from sklearn import ensemble
model = ensemble.GradientBoostingRegressor()
model.fit(train_x,train_y)

from sklearn.model_selection import cross_val_score
score = np.sqrt(-cross_val_score(model, train_x, train_y, cv=50,scoring='neg_mean_squared_error'))
mean_score = np.mean(score)
print(score)
print('mean_score:',mean_score)
del score,mean_score

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[ 8.19597283  4.40797164  4.97348248  1.42683227 16.06717865  3.59019004
  4.71482776  1.06975931  3.10718496  2.36401532  2.88685344  9.51086093
  1.67367704  6.92577443  1.03001332  1.60664983  4.55317582  1.37661117
  2.1527271   2.24603421  0.74788984  5.10963243 19.69863738  6.53547843
 10.16662253  3.74675557  1.64111652 32.30050968 14.01900229 11.24154207
 15.31875101  5.61972654 10.88185473  7.27400734  1.67900771  2.04727099
  3.20943599 24.01347671 39.22997165 17.5648183   1.58129473  1.53573728
  2.05128604 12.48547573  1.33082153  0.94729901  0.99303255  1.91737981
 22.3512092   1.30345869]
mean_score: 7.248445896724051


  y = column_or_1d(y, warn=True)


### 3.2 Feature selection

In [19]:
# feature importance
feature_importance = pd.DataFrame()
feature_importance["feature"] = train_x.columns
feature_importance["importance"] = model.feature_importances_
feature_importance = feature_importance.sort_values("importance", ascending=False)
feature_importance.reset_index(inplace=True)

In [20]:
feature_importance

Unnamed: 0,index,feature,importance
0,88,Difficult,0.199980
1,2,total_positive_reviews,0.189262
2,10,date_interval,0.088758
3,3,total_negative_reviews,0.073376
4,35,Steam Workshop,0.040746
5,6,purchase_date_day,0.038253
6,41,2D,0.034804
7,18,Local Co-op,0.029049
8,22,Online Co-op,0.024949
9,28,Shared/Split Screen,0.023547


In [21]:
select_feats = feature_importance[feature_importance["importance"]>0.001]
select_feats = list(select_feats.feature.values)
print('num of selected features: ',len(select_feats))
print('selected features: ',select_feats)

num of selected features:  40
selected features:  ['Difficult', 'total_positive_reviews', 'date_interval', 'total_negative_reviews', 'Steam Workshop', 'purchase_date_day', '2D', 'Local Co-op', 'Online Co-op', 'Shared/Split Screen', 'price', 'purchase_date_month', 'Party-Based RPG', 'Mouse only', 'Turn-Based Combat', 'Turn-Based', 'Early Access', 'Pixel Graphics', 'release_date_day', 'Replay Value', 'Third Person', 'purchase_date_year', 'RPG', 'Cartoony', 'Alternate History', 'Dungeon Crawler', 'release_date_year', 'Cross-Platform Multiplayer', 'Remote Play on TV', 'Crafting', 'Hunting', 'CRPG', 'Family Friendly', 'Rogue-like', 'Zombies', 'In-App Purchases', 'Building', 'Open World', 'Based On A Novel', 'Massively Multiplayer']


### 3.3 2nd Training (using selected features)

In [22]:
train_x = train_x[select_feats]
print('train_x shape',train_x.shape)

train_x shape (357, 40)


In [23]:
####model####
from sklearn import ensemble
model = ensemble.GradientBoostingRegressor()
model.fit(train_x,train_y)

from sklearn.model_selection import cross_val_score
score = np.sqrt(-cross_val_score(model, train_x, train_y, cv=50,scoring='neg_mean_squared_error'))
mean_score = np.mean(score)
print(score)
print('mean_score:',mean_score)
del score,mean_score

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[ 8.38667802  2.86108426  4.59613285  2.4223982  14.29948409  3.15291275
  4.63036565  0.92566725  2.84811557  2.30494109  2.92901083  9.74927481
  2.05815784 11.57302496  0.74743994  1.53880026  3.97178119  1.70833972
  1.43680164  2.19721829  0.66379128  4.92663824 17.89609815  6.46674146
 13.48496914  4.42722647  2.09545002 30.24848029 15.63234853  9.60600571
 15.46425219  6.44763992 10.50654843  6.12002698  1.6402945   1.84875922
  2.31370409 20.1558516  38.45502473  3.6170004   0.44953747  1.57900631
  1.55698334 11.65981351  1.41179632  1.11702007  1.62658503  2.23309706
 21.59577342  1.89259299]
mean_score: 6.829533721766211


### 3.4 regression prediction

In [24]:
test_x = test.drop(['categories','genres','tags','purchase_date','release_date','id','Free to Play'],axis=1)
test_x = test_x[select_feats]
test_x = test_x[train_x.columns]  
print('test_x shape:',test_x.shape)

test_x shape: (90, 40)


In [25]:
test_y_pred = model.predict(test_x)
test["playtime_forever"] = test_y_pred
result=test[['id','playtime_forever']]
result.sort_values("playtime_forever",inplace=False)

Unnamed: 0,id,playtime_forever
48,48,-1.523937
51,51,-0.638815
40,40,-0.584056
16,16,-0.412489
3,3,-0.398775
26,26,-0.321883
67,67,-0.301103
2,2,-0.276230
22,22,-0.100584
7,7,-0.051978


# 4. Store final result

In [26]:
result.loc[result['playtime_forever']<=0,['playtime_forever']] =0
result.sort_values("playtime_forever",inplace=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,id,playtime_forever
41,41,0.000000
2,2,0.000000
3,3,0.000000
67,67,0.000000
51,51,0.000000
48,48,0.000000
7,7,0.000000
16,16,0.000000
22,22,0.000000
40,40,0.000000


In [27]:
result.to_csv('result/submit_1130_2_2.csv',index=0,header=1)