In [154]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns       
from scipy import stats
from sklearn.model_selection import train_test_split
import warnings

from sklearn.svm import SVC
from xgboost import  XGBClassifier

from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')
%matplotlib inline  
# 为了在jupyter notebook里作图，需要用到这个命令

In [155]:
train = pd.read_csv("train.csv")

In [156]:
print (train.shape)

(7160, 184)


## 移除用不到的Column

In [157]:
train.drop("ID", axis = 1, inplace = True)
print (train.shape)

(7160, 183)


## 檢查Features datatype

In [158]:
print(train.dtypes.unique())
print(len(train.select_dtypes(include=['O']).columns))
print(train.select_dtypes(include=['O']).columns)
print(len(train.select_dtypes(include=['int64']).columns))
print(len(train.select_dtypes(include=['bool']).columns))
print(len(train.select_dtypes(include=['float64']).columns))

[dtype('O') dtype('int64') dtype('bool') dtype('float64')]
5
Index(['appearedTimeOfDay', 'city', 'continent', 'weather', 'weatherIcon'], dtype='object')
4
168
6


## 判斷是否有missing data

In [159]:
# Missing data in train
train_na = train.isnull().sum()
train_na = train_na[train_na>0]
train_na.sort_values(ascending=False)

Series([], dtype: int64)

## 判斷fetures是屬於categorical or numerical

In [160]:
# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include=['object']).columns
print(categorical_features)
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print(numerical_features)

print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
feat_num = train[numerical_features]
feat_cat = train[categorical_features]

Index(['appearedTimeOfDay', 'city', 'continent', 'weather', 'weatherIcon'], dtype='object')
Index(['appearedHour', 'appearedMinute', 'terrainType', 'closeToWater',
       'temperature', 'windSpeed', 'pressure', 'population_density', 'urban',
       'suburban',
       ...
       'cooc_143', 'cooc_144', 'cooc_145', 'cooc_146', 'cooc_147', 'cooc_148',
       'cooc_149', 'cooc_150', 'cooc_151', 'class'],
      dtype='object', length=178)
Numerical features : 178
Categorical features : 5


In [161]:
train=feat_num
train, valid = train_test_split(train, test_size=0.2)

y_train=train['class']
x_train=train
x_train.drop("class", axis = 1, inplace = True)

y_valid=valid['class']
x_valid=valid
x_valid.drop("class", axis = 1, inplace = True)


x_test=test
print (x_train.shape)
print (y_train.shape)
print (x_valid.shape)
print (y_valid.shape)
print (x_test.shape)

(5728, 177)
(5728,)
(1432, 177)
(1432,)
(1791, 182)


In [162]:
model_svc=SVC()
model_svc.fit(x_train,y_train)
y_valid_pred_svc=model_svc.predict(x_valid)

print (y_valid_pred_svc)


[0 0 3 ... 3 2 2]


In [None]:
print ("svc:" ,accuracy_score(y_valid, y_valid_pred_svc))

In [167]:
model_xgboost = XGBClassifier(learning_rate=0.1)
model_xgboost.fit(x_train,y_train)
y_valid_pred_xgboost=model_xgboost.predict(x_valid)
print (y_valid_pred_xgboost)

[0 3 3 ... 0 2 5]


In [168]:
print ("xgboost:" ,accuracy_score(y_valid, y_valid_pred_xgboost))

svc: 0.5027932960893855
xgboost: 0.5551675977653632


In [169]:
x_test = pd.read_csv("test.csv")
x_test.head()

Unnamed: 0,id,appearedTimeOfDay,appearedHour,appearedMinute,terrainType,closeToWater,city,continent,weather,temperature,...,cooc_142,cooc_143,cooc_144,cooc_145,cooc_146,cooc_147,cooc_148,cooc_149,cooc_150,cooc_151
0,MTA5MTEwOTYxMzM0NzA2NDEzNzM=,morning,8,26,13,False,Ljubljana,Europe,MostlyCloudy,16.8,...,False,False,False,False,False,False,False,False,False,False
1,MTY0OTUyMTM2MDExMjg3MjczMjU=,night,2,35,13,True,Los_Angeles,America,PartlyCloudy,18.6,...,False,False,False,False,False,True,False,False,False,False
2,MTQ0ODU1OTEzOTU1NTAyNzI4NjE=,night,0,5,0,True,New_York,America,PartlyCloudy,30.0,...,False,False,False,False,False,False,False,False,False,False
3,MTY1ODg1MzEwNzYzNDUzMDUwNTM=,morning,7,38,13,True,London,Europe,MostlyCloudy,18.5,...,False,False,False,False,False,False,False,False,False,False
4,MTY0NTg2OTA5MTkzOTE2MTc1MTc=,night,1,27,12,False,Chicago,America,Clear,20.2,...,False,False,False,False,False,False,False,False,False,False


In [170]:
print (x_test.shape)
x_test_id=x_test["id"]
x_test.drop("id", axis = 1, inplace = True)
print (x_test.shape)

(1791, 183)
(1791, 182)


In [171]:
test_numerical_features = x_test.select_dtypes(exclude = ["object"]).columns
print(test_numerical_features)

print("Numerical features : " + str(len(test_numerical_features)))
test_feat_num = x_test[test_numerical_features]
x_test=test_feat_num

Index(['appearedHour', 'appearedMinute', 'terrainType', 'closeToWater',
       'temperature', 'windSpeed', 'pressure', 'population_density', 'urban',
       'suburban',
       ...
       'cooc_142', 'cooc_143', 'cooc_144', 'cooc_145', 'cooc_146', 'cooc_147',
       'cooc_148', 'cooc_149', 'cooc_150', 'cooc_151'],
      dtype='object', length=177)
Numerical features : 177


In [172]:
#y_test_pred=model_svc.predict(x_test)
y_test_pred=model_xgboost.predict(x_test)

In [173]:
prediction = pd.DataFrame(y_test_pred, columns=['class'])
result = pd.concat([x_test_id, prediction], axis=1)
# result = result.drop(resultlt.columns[0], 1)
#result.columns
result.head()

Unnamed: 0,id,class
0,MTA5MTEwOTYxMzM0NzA2NDEzNzM=,2
1,MTY0OTUyMTM2MDExMjg3MjczMjU=,1
2,MTQ0ODU1OTEzOTU1NTAyNzI4NjE=,3
3,MTY1ODg1MzEwNzYzNDUzMDUwNTM=,3
4,MTY0NTg2OTA5MTkzOTE2MTc1MTc=,5


In [174]:
result.to_csv('submission.csv', index=False)