In [3]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [21]:
class FillNan(object):
    def fit(self, X, y):
        return self
    def transform(self, X):
        X.fillna(value={'Sector': 'NA'}, inplace=True)
        return X

In [2]:
class CustomEncoder(object):
    encoders = {}
    def fit(self, X, y):
        for column in X.columns:
            if column not in type(self).encoders.keys():
                type(self).encoders[column] = LabelEncoder().fit(X[column])
        return self
    def transform(self, X):
        for column in X.columns:
            X[column] = type(self).encoders[column].transform(X[column])
        return X
    def get_encoders(self):
        return type(self).encoders

In [4]:
data = pd.read_csv('train_call_data_v1.csv')

In [5]:
data['Original Time Queued'] = pd.to_datetime(data['Original Time Queued'])

In [6]:
data['Quarter'] = data['Original Time Queued'].map(lambda x: int(x.hour/6))

In [7]:
data_tmp = data.drop(['Unnamed: 0', 'CAD Event Number', 'Arrived Time', 'Original Time Queued'], axis=1)

In [22]:
y = data_tmp.iloc[:,0]

In [23]:
X = data_tmp.iloc[:,1:]

In [24]:
X.isnull().any()

Call Type            False
Priority             False
Initial Call Type    False
Final Call Type      False
Precinct             False
Sector                True
Beat                 False
Quarter              False
dtype: bool

In [25]:
features_pipeline = make_pipeline(FillNan(), CustomEncoder())

In [26]:
X = features_pipeline.fit(X, y).transform(X)

In [28]:
X.head(10)

Unnamed: 0,Call Type,Priority,Initial Call Type,Final Call Type,Precinct,Sector,Beat,Quarter
0,6,2,182,281,5,13,58,0
1,4,6,141,245,5,7,36,0
2,6,1,85,42,1,11,51,0
3,6,1,17,112,2,12,52,0
4,4,2,174,42,5,7,37,0
5,0,3,123,42,1,6,32,0
6,6,1,150,254,2,12,52,0
7,6,2,176,103,5,2,12,0
8,6,2,130,192,5,13,60,0
9,4,6,175,101,2,14,64,0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.3)

In [30]:
model_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

In [42]:
model_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [None]:
model_pipeline.trans

## Interpretando segundo arquivo

In [34]:
data2 = pd.read_csv('test_call_data.csv')

In [36]:
data2['Original Time Queued'] = pd.to_datetime(data2['Original Time Queued'])

In [37]:
data2['Quarter'] = data2['Original Time Queued'].map(lambda x: int(x.hour/6))

In [38]:
data2_tmp = data2.drop(['Unnamed: 0', 'CAD Event Number', 'Arrived Time', 'Original Time Queued'], axis=1)

In [39]:
y = data_tmp.iloc[:,0]

In [40]:
X = data_tmp.iloc[:,1:]

In [41]:
X = features_pipeline.fit(X, y).transform(X)