In [1]:
import pandas as pd
import numpy as np
import json
import sys
from sklearn.preprocessing import LabelEncoder   #把字串符號轉數字
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [2]:
with open('SMP/train_img.txt', 'r') as f:
    imgs = f.read().splitlines()

In [3]:
imageID = []
for i in range(len(imgs)):
    imageID.append(imgs[i].split('/')[-1])
imageID = pd.DataFrame(imageID)
imageID = imageID.rename(columns={0:'imageID'})

In [4]:
imageID.head()

Unnamed: 0,imageID
0,385070026
1,943750056
2,3246928439
3,3432316502
4,3538960847


In [5]:
label = pd.read_csv("SMP/train_label.txt", header = None) #處理沒有feature name的方式
label = label.rename(columns={0:'score'})
label.head()

Unnamed: 0,score
0,11.18
1,15.15
2,10.99
3,8.63
4,11.16


In [6]:
imageInformation = pd.read_csv("SMP/image_information.txt",skiprows=[764,767], header = None).astype('int64')
imageInformation = imageInformation.rename(columns={0:'imageID'})
imageInformation = imageInformation.rename(columns={1:'ViewCount'})
imageInformation = imageInformation.rename(columns={2:'FavoriteCount'})
imageInformation = imageInformation.rename(columns={3:'Message'})
imageInformation = imageInformation.reindex(columns=['imageID', 'ViewCount', 'FavoriteCount', 'Message']) #重設置index
imageInformation.head()

Unnamed: 0,imageID,ViewCount,FavoriteCount,Message
0,943750056,49403,31,32
1,3544891702,21131,41,7
2,3246928439,3455,8,1
3,3432316502,615,1,4
4,3538960847,3980,57,10


In [7]:
with open('SMP/train_tags.json', 'r') as f:
    tags = json.load(f)
tags = pd.DataFrame(tags)
#tags.head()

In [8]:
AllTags = []
for i in range(len(tags['Alltags'])):
    AllTags.append(len(tags['Alltags'][i].split()))

AllTags = pd.DataFrame(AllTags)
AllTags = AllTags.rename(columns={0:'TagsCount'})

In [9]:
AllTags.head()

Unnamed: 0,TagsCount
0,12
1,65
2,23
3,9
4,19


In [10]:
# with open('SMP/train_temporalspatial.json', 'r') as ff:
#     temporalspatial = json.load(ff)
# temporalspatial = pd.DataFrame(temporalspatial)
# temporalspatial.head()

In [11]:
with open('SMP/train_category.json', 'r') as ff:
    category = json.load(ff)
category = pd.DataFrame(category)
#category.head()

In [12]:
#把字串符號轉成數字
Uid = category["Uid"]
Uid = pd.DataFrame(Uid)
Uid["Uid"] = Uid["Uid"].apply(str)
Uid["Uid_code"] = LabelEncoder().fit_transform(Uid["Uid"])
Uid.drop(columns = ["Uid"],inplace=True)
Uid.head()

Unnamed: 0,Uid_code
0,31775
1,2876
2,14395
3,33828
4,1934


In [13]:
#把字串符號轉成數字
Concept = category["Concept"]
Concept = pd.DataFrame(Concept)
Concept["Concept"] = Concept["Concept"].apply(str)
Concept["Concept_code"] = LabelEncoder().fit_transform(Concept["Concept"])
Concept.drop(columns = ["Concept"],inplace=True)
Concept.head()

Unnamed: 0,Concept_code
0,254
1,200
2,105
3,426
4,602


In [14]:
# with open('SMP/train_additional.json', 'r') as ff:
#     additional = json.load(ff)
# additional = pd.DataFrame(additional)
# additional.head()

In [15]:
trainData = pd.concat( [imageID , Uid], axis=1 )   # axis=1是X軸，axis=0是y軸
trainData = pd.concat( [trainData , Concept], axis=1 )
trainData = pd.concat( [trainData , AllTags], axis=1 )
trainData = pd.concat( [trainData , label], axis=1 )
trainData.head()

Unnamed: 0,imageID,Uid_code,Concept_code,TagsCount,score
0,385070026,31775,254,12,11.18
1,943750056,2876,200,65,15.15
2,3246928439,14395,105,23,10.99
3,3432316502,33828,426,9,8.63
4,3538960847,1934,602,19,11.16


In [16]:
trainData["imageID"] = trainData["imageID"].apply(int)
trainData = pd.merge(imageInformation, trainData, how='left', on='imageID')
trainData = trainData[['imageID','Uid_code','Concept_code','ViewCount','FavoriteCount','Message','TagsCount','score']]
trainData.head()

Unnamed: 0,imageID,Uid_code,Concept_code,ViewCount,FavoriteCount,Message,TagsCount,score
0,943750056,2876.0,200.0,49403,31,32,65.0,15.15
1,3544891702,1934.0,212.0,21131,41,7,37.0,14.24
2,3246928439,14395.0,105.0,3455,8,1,23.0,10.99
3,3432316502,33828.0,426.0,615,1,4,9.0,8.63
4,3538960847,1934.0,602.0,3980,57,10,19.0,11.16


In [17]:
trainData.shape

(273119, 8)

In [18]:
trainData.dropna(axis=0,inplace=True)   # axis=1是X軸，axis=0是y軸

In [19]:
trainData.shape

(273113, 8)

In [20]:
trainData.isnull().any()

imageID          False
Uid_code         False
Concept_code     False
ViewCount        False
FavoriteCount    False
Message          False
TagsCount        False
score            False
dtype: bool

In [21]:
temp , X_test= train_test_split(trainData, test_size=0.2)
X_train , X_valid = train_test_split(temp,test_size=0.1)
Y_train = X_train["score"]
X_train = X_train.drop(["score"],axis = 1)
Y_test = X_test["score"]
X_test = X_test.drop(["score"],axis = 1)
Y_valid = X_valid["score"]
X_valid = X_valid.drop(["score"],axis = 1)

In [22]:
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds = 15)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:5.8058	validation_1-rmse:5.81522
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 15 rounds.
[1]	validation_0-rmse:5.23497	validation_1-rmse:5.24434
[2]	validation_0-rmse:4.72192	validation_1-rmse:4.7313
[3]	validation_0-rmse:4.26309	validation_1-rmse:4.27238
[4]	validation_0-rmse:3.8514	validation_1-rmse:3.86061
[5]	validation_0-rmse:3.48234	validation_1-rmse:3.49174
[6]	validation_0-rmse:3.15166	validation_1-rmse:3.16096
[7]	validation_0-rmse:2.85239	validation_1-rmse:2.86196
[8]	validation_0-rmse:2.60273	validation_1-rmse:2.61316
[9]	validation_0-rmse:2.36361	validation_1-rmse:2.37419
[10]	validation_0-rmse:2.15036	validation_1-rmse:2.16132
[11]	validation_0-rmse:1.95713	validation_1-rmse:1.96884
[12]	validation_0-rmse:1.78506	validation_1-rmse:1.79724
[13]	validation_0-rmse:1.63231	validation_1-rmse:1.64504
[14]	validation_0-rmse:1.49923	validation_1-rmse:1.5125


[137]	validation_0-rmse:0.571251	validation_1-rmse:0.601346
[138]	validation_0-rmse:0.570629	validation_1-rmse:0.600861
[139]	validation_0-rmse:0.57053	validation_1-rmse:0.600768
[140]	validation_0-rmse:0.570394	validation_1-rmse:0.600677
[141]	validation_0-rmse:0.570264	validation_1-rmse:0.600612
[142]	validation_0-rmse:0.570079	validation_1-rmse:0.600483
[143]	validation_0-rmse:0.569777	validation_1-rmse:0.600255
[144]	validation_0-rmse:0.569632	validation_1-rmse:0.600096
[145]	validation_0-rmse:0.569238	validation_1-rmse:0.599822
[146]	validation_0-rmse:0.568679	validation_1-rmse:0.599347
[147]	validation_0-rmse:0.568543	validation_1-rmse:0.599227
[148]	validation_0-rmse:0.568099	validation_1-rmse:0.598753
[149]	validation_0-rmse:0.567836	validation_1-rmse:0.598504
[150]	validation_0-rmse:0.567646	validation_1-rmse:0.598389
[151]	validation_0-rmse:0.567617	validation_1-rmse:0.598344
[152]	validation_0-rmse:0.567517	validation_1-rmse:0.598248
[153]	validation_0-rmse:0.567313	validati

[275]	validation_0-rmse:0.548585	validation_1-rmse:0.582891
[276]	validation_0-rmse:0.548459	validation_1-rmse:0.582818
[277]	validation_0-rmse:0.548402	validation_1-rmse:0.58277
[278]	validation_0-rmse:0.548178	validation_1-rmse:0.582631
[279]	validation_0-rmse:0.548135	validation_1-rmse:0.582612
[280]	validation_0-rmse:0.548084	validation_1-rmse:0.582568
[281]	validation_0-rmse:0.548028	validation_1-rmse:0.582508
[282]	validation_0-rmse:0.547938	validation_1-rmse:0.582448
[283]	validation_0-rmse:0.547814	validation_1-rmse:0.582336
[284]	validation_0-rmse:0.547724	validation_1-rmse:0.582309
[285]	validation_0-rmse:0.547675	validation_1-rmse:0.582277
[286]	validation_0-rmse:0.547655	validation_1-rmse:0.582258
[287]	validation_0-rmse:0.547609	validation_1-rmse:0.582244
[288]	validation_0-rmse:0.547428	validation_1-rmse:0.582122
[289]	validation_0-rmse:0.547411	validation_1-rmse:0.582116
[290]	validation_0-rmse:0.547147	validation_1-rmse:0.581934
[291]	validation_0-rmse:0.547022	validati

[413]	validation_0-rmse:0.53465	validation_1-rmse:0.572935
[414]	validation_0-rmse:0.534406	validation_1-rmse:0.572739
[415]	validation_0-rmse:0.534352	validation_1-rmse:0.572716
[416]	validation_0-rmse:0.534268	validation_1-rmse:0.572667
[417]	validation_0-rmse:0.534256	validation_1-rmse:0.572674
[418]	validation_0-rmse:0.534181	validation_1-rmse:0.572636
[419]	validation_0-rmse:0.534152	validation_1-rmse:0.572608
[420]	validation_0-rmse:0.534086	validation_1-rmse:0.572539
[421]	validation_0-rmse:0.533981	validation_1-rmse:0.572421
[422]	validation_0-rmse:0.53394	validation_1-rmse:0.572379
[423]	validation_0-rmse:0.533816	validation_1-rmse:0.572306
[424]	validation_0-rmse:0.533796	validation_1-rmse:0.57227
[425]	validation_0-rmse:0.533723	validation_1-rmse:0.572174
[426]	validation_0-rmse:0.53366	validation_1-rmse:0.572116
[427]	validation_0-rmse:0.533644	validation_1-rmse:0.572124
[428]	validation_0-rmse:0.533555	validation_1-rmse:0.572046
[429]	validation_0-rmse:0.533522	validation_

[551]	validation_0-rmse:0.52417	validation_1-rmse:0.565741
[552]	validation_0-rmse:0.52407	validation_1-rmse:0.565674
[553]	validation_0-rmse:0.524042	validation_1-rmse:0.565633
[554]	validation_0-rmse:0.523949	validation_1-rmse:0.565573
[555]	validation_0-rmse:0.523926	validation_1-rmse:0.565561
[556]	validation_0-rmse:0.523878	validation_1-rmse:0.565524
[557]	validation_0-rmse:0.523781	validation_1-rmse:0.565449
[558]	validation_0-rmse:0.523673	validation_1-rmse:0.565377
[559]	validation_0-rmse:0.523661	validation_1-rmse:0.565378
[560]	validation_0-rmse:0.523558	validation_1-rmse:0.565297
[561]	validation_0-rmse:0.523538	validation_1-rmse:0.565282
[562]	validation_0-rmse:0.523468	validation_1-rmse:0.56523
[563]	validation_0-rmse:0.523446	validation_1-rmse:0.565237
[564]	validation_0-rmse:0.523429	validation_1-rmse:0.565231
[565]	validation_0-rmse:0.523398	validation_1-rmse:0.56521
[566]	validation_0-rmse:0.523336	validation_1-rmse:0.5652
[567]	validation_0-rmse:0.523212	validation_1-

[689]	validation_0-rmse:0.515385	validation_1-rmse:0.559932
[690]	validation_0-rmse:0.515373	validation_1-rmse:0.55992
[691]	validation_0-rmse:0.515354	validation_1-rmse:0.559925
[692]	validation_0-rmse:0.515305	validation_1-rmse:0.559878
[693]	validation_0-rmse:0.515146	validation_1-rmse:0.55976
[694]	validation_0-rmse:0.515034	validation_1-rmse:0.559714
[695]	validation_0-rmse:0.515018	validation_1-rmse:0.559748
[696]	validation_0-rmse:0.514995	validation_1-rmse:0.559732
[697]	validation_0-rmse:0.514979	validation_1-rmse:0.559734
[698]	validation_0-rmse:0.514948	validation_1-rmse:0.559722
[699]	validation_0-rmse:0.514925	validation_1-rmse:0.559711
[700]	validation_0-rmse:0.514919	validation_1-rmse:0.559706
[701]	validation_0-rmse:0.51487	validation_1-rmse:0.559708
[702]	validation_0-rmse:0.514842	validation_1-rmse:0.559667
[703]	validation_0-rmse:0.514705	validation_1-rmse:0.559585
[704]	validation_0-rmse:0.514665	validation_1-rmse:0.559553
[705]	validation_0-rmse:0.514634	validation

[826]	validation_0-rmse:0.508237	validation_1-rmse:0.555625
[827]	validation_0-rmse:0.508193	validation_1-rmse:0.555593
[828]	validation_0-rmse:0.508184	validation_1-rmse:0.555582
[829]	validation_0-rmse:0.508103	validation_1-rmse:0.555521
[830]	validation_0-rmse:0.508079	validation_1-rmse:0.55548
[831]	validation_0-rmse:0.508065	validation_1-rmse:0.555473
[832]	validation_0-rmse:0.508041	validation_1-rmse:0.555432
[833]	validation_0-rmse:0.508033	validation_1-rmse:0.555427
[834]	validation_0-rmse:0.508003	validation_1-rmse:0.555407
[835]	validation_0-rmse:0.50799	validation_1-rmse:0.555385
[836]	validation_0-rmse:0.507927	validation_1-rmse:0.555347
[837]	validation_0-rmse:0.507899	validation_1-rmse:0.555379
[838]	validation_0-rmse:0.507878	validation_1-rmse:0.555386
[839]	validation_0-rmse:0.507784	validation_1-rmse:0.555287
[840]	validation_0-rmse:0.507758	validation_1-rmse:0.555285
[841]	validation_0-rmse:0.507731	validation_1-rmse:0.55527
[842]	validation_0-rmse:0.507662	validation

[964]	validation_0-rmse:0.501647	validation_1-rmse:0.55151
[965]	validation_0-rmse:0.501548	validation_1-rmse:0.551421
[966]	validation_0-rmse:0.501493	validation_1-rmse:0.551371
[967]	validation_0-rmse:0.501479	validation_1-rmse:0.551368
[968]	validation_0-rmse:0.501464	validation_1-rmse:0.551332
[969]	validation_0-rmse:0.501394	validation_1-rmse:0.55131
[970]	validation_0-rmse:0.501389	validation_1-rmse:0.551305
[971]	validation_0-rmse:0.501377	validation_1-rmse:0.551297
[972]	validation_0-rmse:0.501327	validation_1-rmse:0.551296
[973]	validation_0-rmse:0.501298	validation_1-rmse:0.551278
[974]	validation_0-rmse:0.501296	validation_1-rmse:0.551267
[975]	validation_0-rmse:0.501286	validation_1-rmse:0.551233
[976]	validation_0-rmse:0.501234	validation_1-rmse:0.551203
[977]	validation_0-rmse:0.501209	validation_1-rmse:0.55117
[978]	validation_0-rmse:0.501158	validation_1-rmse:0.551148
[979]	validation_0-rmse:0.501125	validation_1-rmse:0.551119
[980]	validation_0-rmse:0.50108	validation_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eta=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=300, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

In [23]:
Y_pred = model.predict(X_test).clip(0, 20)
print(Y_pred)

[ 8.451891   3.9385023  3.320075  ...  5.4039326  6.5804296 11.643914 ]


In [24]:
error = []
for i in range(len(Y_test)):
    error.append(Y_test.values[i] - Y_pred[i])
    
#print("Errors: ", error)
#print(error)
squaredError = []
absError = []
for val in error:
    squaredError.append(val * val)#平方
    absError.append(abs(val))#誤差絕對值
    
#print("Square Error: ", squaredError)
#print("Absolute Value of Error: ", absError)
print("MSE = ", sum(squaredError) / len(squaredError))#平均平方誤差MSE


#from math import sqrt
#print("RMSE = ", sqrt(sum(squaredError) / len(squaredError)))#平均平方根誤差RMSE
print("MAE = ", sum(absError) / len(absError))#平均絕對誤差MAE

MSE =  0.2862616734108291
MAE =  0.34321381370899495
