In [31]:
import numpy as np
import pandas as pd
import csv
from sklearn.externals import joblib

In [32]:
# read in train dataset and test dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_size = train.shape[0]
test_size = test.shape[0]
print("train_size:%d" % (train_size))
print("test_size:%d" % (test_size))

train_size:357
test_size:90


In [33]:
# detecting missing values
for columname in train.columns:
    if train[columname].count() != len(train):
        loc = train[columname][train[columname].isnull().values==True].index.tolist()
        print('训练集：{}, 第{}行位置有缺失值'.format(columname,loc))
for columname in test.columns:
    if test[columname].count() != len(test):
        loc = test[columname][test[columname].isnull().values==True].index.tolist()
        print('测试集：{}, 第{}行位置有缺失值'.format(columname,loc))

# missing value processing
# drop two columns in train dataset and two columns in test dataset
train.dropna(axis=0, how='any', inplace=True)
test.dropna(axis=0, how='any', inplace=True)
train_size = train.shape[0]
test_size = test.shape[0]
print("train_size:%d" % (train_size))
print("test_size:%d" % (test_size))

训练集：purchase_date, 第[5, 76]行位置有缺失值
训练集：total_positive_reviews, 第[5, 76]行位置有缺失值
训练集：total_negative_reviews, 第[5, 76]行位置有缺失值
测试集：purchase_date, 第[12, 45]行位置有缺失值
测试集：total_positive_reviews, 第[12, 45]行位置有缺失值
测试集：total_negative_reviews, 第[12, 45]行位置有缺失值
train_size:355
test_size:88


In [34]:
# create a dataframe containing training and test sets, drop column ['id']
y_train = train['playtime_forever']
train = train[test.columns]
df = train.append(test)
df.drop(['id'], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443 entries, 0 to 89
Data columns (total 9 columns):
is_free                   443 non-null bool
price                     443 non-null float64
genres                    443 non-null object
categories                443 non-null object
tags                      443 non-null object
purchase_date             443 non-null object
release_date              443 non-null object
total_positive_reviews    443 non-null float64
total_negative_reviews    443 non-null float64
dtypes: bool(1), float64(3), object(5)
memory usage: 31.6+ KB


In [35]:
# split the three columns geners, categories and tags with a comma as a separator 
# to get a dictionary of all words in each column
genres = {}
categories = {}
tags = {}
for x in df['genres']:
    tmp = x.split(',')
    for y in tmp:
        if y in genres:
            genres[y] = genres[y] + 1
        else:
            genres[y] = 1

for x in df['categories']:
    tmp = x.split(',')
    for y in tmp:
        if y in categories:
            categories[y] = categories[y] + 1
        else:
            categories[y] = 1

for x in df['tags']:
    tmp = x.split(',')
    for y in tmp:
        if y in tags:
            tags[y] = tags[y] + 1
        else:
            tags[y] = 1

print("genres_num:%d" % (len(genres)))
print("categories_num:%d" % (len(categories)))
print("tags_num:%d" % (len(tags)))

genres_num:20
categories_num:29
tags_num:317


In [36]:
# use the dictionary obtained in the previous step to encode the three columns. 
# one-hot encoding is used, and words that occur less than 1/10 of the size of the dataset are directly discarded.
for key in genres:
    if(genres[key] < 45):
         continue
    tmp = []
    for x in df['genres']:
        if key in x:
            tmp.append(1)
        else:
            tmp.append(0)
    df[key] = tmp
df = df.drop(['genres'], axis = 1)
    
for key in categories:
    if(categories[key] < 45):
         continue
    tmp = []
    for x in df['categories']:
        if key in x:
            tmp.append(1)
        else:
            tmp.append(0)
    df[key] = tmp
df.drop(['categories'], axis = 1, inplace = True)

for key in tags:
    if(tags[key] < 45):
         continue
    tmp = []
    for x in df['tags']:
        if key in x:
            tmp.append(1)
        else:
            tmp.append(0)
    df[key] = tmp
df = df.drop(['tags'], axis = 1)

In [37]:
month = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}

In [38]:
# purchase date processing, directly divided into three columns of year, month and day
purchase_date_year = []
purchase_date_month = []
purchase_date_day = []
for x in df['purchase_date']:
        tmp = x.split(' ')
        purchase_date_month.append(month[tmp[0]])
        purchase_date_day.append(int(tmp[1][:-1]))
        purchase_date_year.append(int(tmp[2]))
df['purchase_date_year'] = purchase_date_year
df['purchase_date_month'] = purchase_date_month
df['purchase_date_day'] = purchase_date_day
df = df.drop(['purchase_date'], axis = 1)

In [39]:
# release date processing, directly divided into three columns of year, month and day
release_date_year = []
release_date_month = []
release_date_day = []
for x in df['release_date'][:train_size]:
    if x == "Nov 10, 2016":
        release_date_month.append(11)
        release_date_day.append(10)
        release_date_year.append(2016)
    else:
        tmp = x.split(' ')
        release_date_day.append(int(tmp[0]))
        release_date_month.append(month[tmp[1][:-1]])
        release_date_year.append(int(tmp[2]))
for x in df['release_date'][train_size:]:
        tmp = x.split('-')
        release_date_day.append(int(tmp[0]))
        release_date_month.append(month[tmp[1]])
        release_date_year.append(2000 + int(tmp[2]))
    
df['release_date_year'] = release_date_year
df['release_date_month'] = release_date_month
df['release_date_day'] = release_date_day
df = df.drop(['release_date'], axis = 1)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443 entries, 0 to 89
Data columns (total 61 columns):
is_free                       443 non-null bool
price                         443 non-null float64
total_positive_reviews        443 non-null float64
total_negative_reviews        443 non-null float64
Adventure                     443 non-null int64
Casual                        443 non-null int64
Indie                         443 non-null int64
RPG                           443 non-null int64
Action                        443 non-null int64
Strategy                      443 non-null int64
Simulation                    443 non-null int64
Single-player                 443 non-null int64
Steam Trading Cards           443 non-null int64
Steam Cloud                   443 non-null int64
Partial Controller Support    443 non-null int64
Full controller support       443 non-null int64
Multi-player                  443 non-null int64
Steam Achievements            443 non-null int64
Steam Wor

In [43]:
# training code
# use sklearn library's random forest algorithm to process regression task
x_train = df[:train_size]
x_test = df[train_size:]
from sklearn.ensemble import RandomForestRegressor
myrfr = RandomForestRegressor(
    n_estimators = 100, 
    bootstrap = True,
    oob_score = True,
    min_samples_leaf = 1,
    min_samples_split = 2
)
myrfr.fit(x_train, y_train)
joblib.dump(myrfr, 'myrfr.pkl')
y_test = myrfr.predict(x_test)
y_test = np.insert(y_test,12,0)
y_test = np.insert(y_test,45,0)
for i in range(0, len(y_test)):
    if(y_test[i] <= 0.5): y_test[i] = 0
    
print(y_test)

[ 0.          5.23116667  0.          0.          3.93066667  1.54316667
  0.          0.          1.05233333  5.80633333  0.          0.
  0.          1.786       3.00116667  0.          1.5895      1.73266667
  1.49083333  0.          0.97383333  1.57983333  2.27083333  0.68216667
  0.68433333  0.          0.          0.75966667  1.09283333  1.8195
  2.608      13.91366667  0.          0.78733333  4.83716667  0.535
  4.15383333  9.99633333  1.14266667  1.02666667  3.31333333  0.6445
  2.85333333  1.06433333  0.79916667  0.          0.5525      0.
  2.39533333 12.545       1.21166667  0.          1.63283333  0.
  0.79716667  0.          1.98016667  1.17516667  1.0705      1.62716667
  0.512       0.          0.         14.764       0.          0.
  0.6405      0.          0.74333333  0.56        0.9085     24.03483333
  1.79283333 34.09966667  0.54333333 42.05766667  4.25266667  2.14516667
  1.58766667  0.93433333  0.          1.1185      0.          8.8085
  8.67        0.5115      0

In [44]:
# if you want to use the saved model, use this code
myrfr = joblib.load('myrfr.pkl')
y_test = myrfr.predict(x_test)
y_test = np.insert(y_test,12,0)
y_test = np.insert(y_test,45,0)
for i in range(0, len(y_test)):
    if(y_test[i] <= 0.1): y_test[i] = 0
print(y_test)

[ 0.21316667  5.23116667  0.          0.2795      3.93066667  1.54316667
  0.20366667  0.14683333  1.05233333  5.80633333  0.32983333  0.37066667
  0.          1.786       3.00116667  0.4555      1.5895      1.73266667
  1.49083333  0.1295      0.97383333  1.57983333  2.27083333  0.68216667
  0.68433333  0.36        0.44933333  0.75966667  1.09283333  1.8195
  2.608      13.91366667  0.47566667  0.78733333  4.83716667  0.535
  4.15383333  9.99633333  1.14266667  1.02666667  3.31333333  0.6445
  2.85333333  1.06433333  0.79916667  0.          0.5525      0.40683333
  2.39533333 12.545       1.21166667  0.35283333  1.63283333  0.36933333
  0.79716667  0.451       1.98016667  1.17516667  1.0705      1.62716667
  0.512       0.11966667  0.22266667 14.764       0.131       0.38216667
  0.6405      0.49366667  0.74333333  0.56        0.9085     24.03483333
  1.79283333 34.09966667  0.54333333 42.05766667  4.25266667  2.14516667
  1.58766667  0.93433333  0.          1.1185      0.          8.

In [45]:
# save prediction results and output
y_test = pd.DataFrame(y_test)
y_test = y_test.reset_index()
y_test.columns = ['id', 'playtime_forever']
y_test.to_csv("mysubmission.csv", index = False)