In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings;
warnings.simplefilter('ignore')

In [2]:
%matplotlib inline 

import numpy as np   
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
##Reading the data from .csv file###
df_rail= pd.read_csv('rail_data.csv',header=0,encoding = 'unicode_escape')

In [4]:
df_rail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4751 entries, 0 to 4750
Data columns (total 30 columns):
Date                       4751 non-null object
SubDivision                4751 non-null object
GangId                     4751 non-null object
OrderNumber                0 non-null float64
OperationNumber            4751 non-null int64
TrackId                    4751 non-null object
MegaWorkBlock              4751 non-null object
ShadowWorkBlock            4751 non-null object
SplitWorkBlock             4751 non-null object
WBMileFrom                 4751 non-null float64
WBMileTo                   4751 non-null float64
AnchorPattern              4751 non-null object
SpikePattern               4751 non-null object
CwrTerritory               4751 non-null object
DestressingMethod          4751 non-null object
ClosureType                0 non-null float64
NumberofInsulatedJoints    4751 non-null int64
NumberofTransitionRails    4751 non-null int64
NumberofCompromiseRails    4751 no

In [5]:
df_rail.isnull().any()

Date                       False
SubDivision                False
GangId                     False
OrderNumber                 True
OperationNumber            False
TrackId                    False
MegaWorkBlock              False
ShadowWorkBlock            False
SplitWorkBlock             False
WBMileFrom                 False
WBMileTo                   False
AnchorPattern              False
SpikePattern               False
CwrTerritory               False
DestressingMethod          False
ClosureType                 True
NumberofInsulatedJoints    False
NumberofTransitionRails    False
NumberofCompromiseRails    False
FastenerTypeTie             True
PlateChangeOutRequired     False
RailType                   False
TravelTimeDuringBlocks     False
ActualStartTime             True
ActualEndTime               True
NumberofTampers             True
ExpectedOutput             False
ConfidenceLevel            False
Version                    False
workBlockPlannedMinutes    False
dtype: boo

In [6]:
df_rail[df_rail.select_dtypes(['object']).columns] = df_rail.select_dtypes(['object']).apply(lambda x: x.astype('category'))

In [7]:
# **************Cols to be removed*********************************** #
# 
#Date-  Has no value in it
#OrderNumber- identifier will not have impact in prediction
#NumberofTampers- Has all Null value
# ******************************************************************** #

In [8]:
df_rail_drop= df_rail.drop(['Date', 'OrderNumber'], axis=1)

In [9]:
df_rail_drop['miles']= abs(df_rail_drop['WBMileFrom'] - df_rail_drop['WBMileTo'])

In [10]:
#Label Encode GangId categorical column
from sklearn import preprocessing

encode = preprocessing.LabelEncoder()
df_rail_drop['GangId']=encode.fit_transform(df_rail_drop['GangId'])
keys = encode.classes_
values = encode.transform(encode.classes_)
GangId_dictionary = dict(zip(keys, values))
print(GangId_dictionary)

{'S70P0G02': 0, 'S70P0R01': 1, 'S70P0R03': 2, 'S70P0R04': 3, 'S70P0R05': 4, 'S70P0W13': 5, 'S70P0W15': 6, 'S80P0R02': 7, 'S80P0R03': 8, 'S80P0R04': 9, 'W40P0G01': 10, 'W40P0M14': 11, 'W40P0R01': 12, 'W40P0R02': 13, 'W40P0R03': 14, 'W40P0R04': 15, 'W40P0R05': 16, 'W40P0R06': 17, 'W40P0W04': 18, 'W40P0W08': 19, 'W40P0W09': 20, 'W40P0W11': 21, 'W40P0W12': 22, 'W40P0W14': 23, 'W40P0W16': 24, 'W40P0W17': 25, 'W40P0W20': 26, 'W50P0M05': 27, 'W50P0M06': 28, 'W50P0R01': 29, 'W50P0R02': 30, 'W50P0R03': 31, 'W50P0R04': 32, 'W50P0R05': 33, 'W50P0R06': 34, 'W50P0R07': 35, 'W50P0R08': 36, 'W50P0W01': 37, 'W50P0W05': 38, 'W50P0W07': 39, 'W50P0W11': 40, 'W50P0W13': 41, 'W50P0W15': 42, 'W50P0W16': 43, 'W50P0W17': 44, 'W50P0W18': 45, 'W50P0W20': 46}


In [11]:
df_rail_drop['TrackId']=encode.fit_transform(df_rail_drop['TrackId'])
keys = encode.classes_
values = encode.transform(encode.classes_)
TrackID_dictionary = dict(zip(keys, values))
print(TrackID_dictionary)

{'01S': 0, '02S': 1, '03S': 2, '04S': 3, '1': 4, '2': 5, '3': 6, '6625': 7, 'BP02': 8, 'CSL1': 9, 'DE48': 10, 'DF22': 11, 'DF24': 12, 'DF40': 13, 'DF45': 14, 'DF48': 15, 'DF51': 16, 'DX1': 17, 'DX2': 18, 'E01': 19, 'EL00': 20, 'ER8': 21, 'ER9': 22, 'FB14': 23, 'FB40': 24, 'FB50': 25, 'FB51': 26, 'FB55': 27, 'FF20': 28, 'FF63': 29, 'FG13': 30, 'H040': 31, 'HA06': 32, 'HA08': 33, 'HA14': 34, 'HA26': 35, 'HA34': 36, 'IS99': 37, 'LD09': 38, 'LD11': 39, 'LX4': 40, 'NSLW': 41, 'NT01': 42, 'PA05': 43, 'PC30': 44, 'RB13': 45, 'RTW': 46, 'RX1': 47, 'RZ41': 48, 'SF67': 49, 'SL01': 50, 'SL02': 51, 'SO90': 52, 'UA41': 53, 'W03': 54, 'WR4': 55, 'WS01': 56, 'X2': 57, 'X3': 58, 'YB08': 59, 'YB13': 60, 'YF11': 61}


In [12]:
# MegaWorkBlock has 3 values, Yes, No and NaN. We can change Nan to No
# Convert NaN to NO
df_rail_drop['MegaWorkBlock'].fillna('No', inplace=True)

In [13]:
df_rail_drop['MegaWorkBlock']=encode.fit_transform(df_rail_drop['MegaWorkBlock'])
keys = encode.classes_
values = encode.transform(encode.classes_)
MegaWorkBlock_dictionary = dict(zip(keys, values))
print(MegaWorkBlock_dictionary)

{'No': 0, 'Yes': 1}


In [14]:
# ShadowWorkBlock has 3 values, Yes, No and NaN. We can change Nan to No
# Convert NaN to NO
df_rail_drop['ShadowWorkBlock'].fillna('No', inplace=True)

In [15]:
df_rail_drop['ShadowWorkBlock']=encode.fit_transform(df_rail_drop['ShadowWorkBlock'])
keys = encode.classes_
values = encode.transform(encode.classes_)
ShadowWorkBlock_dictionary = dict(zip(keys, values))
print(ShadowWorkBlock_dictionary)

{'No': 0, 'Yes': 1}


In [16]:
#df_rail_drop['SplitWorkBlock'].fillna('No', inplace=True)
df_rail_drop['SplitWorkBlock']=encode.fit_transform(df_rail_drop['SplitWorkBlock'])
keys = encode.classes_
values = encode.transform(encode.classes_)
SplitWorkBlock_dictionary = dict(zip(keys, values))
print(SplitWorkBlock_dictionary)

{'Yes': 0}


In [17]:
df_rail_drop['AnchorPattern']=encode.fit_transform(df_rail_drop['AnchorPattern'])
keys = encode.classes_
values = encode.transform(encode.classes_)
AnchorPattern_dictionary = dict(zip(keys, values))
print(AnchorPattern_dictionary)

{'Pattern 10': 0}


In [18]:
df_rail_drop['SpikePattern']=encode.fit_transform(df_rail_drop['SpikePattern'])
keys = encode.classes_
values = encode.transform(encode.classes_)
SpikePattern_dictionary = dict(zip(keys, values))
print(SpikePattern_dictionary)

{'A': 0, 'B': 1, 'C': 2, 'D': 3}


In [19]:
df_rail_drop.drop(['NumberofTampers','ActualStartTime','ActualEndTime'], axis=1,inplace=True )

In [20]:
df_rail_drop['NumberofCompromiseRails']=encode.fit_transform(df_rail_drop['NumberofCompromiseRails'])
keys = encode.classes_
values = encode.transform(encode.classes_)
NumberofCompromiseRails_dictionary = dict(zip(keys, values))
print(SpikePattern_dictionary)

{'A': 0, 'B': 1, 'C': 2, 'D': 3}


In [21]:
df_rail_drop['RailType']=encode.fit_transform(df_rail_drop['RailType'])
keys = encode.classes_
values = encode.transform(encode.classes_)
RailType_dictionary = dict(zip(keys, values))
print(SpikePattern_dictionary)

{'A': 0, 'B': 1, 'C': 2, 'D': 3}


In [22]:
df_rail_drop['SubDivision']=encode.fit_transform(df_rail_drop['SubDivision'])
keys = encode.classes_
values = encode.transform(encode.classes_)
SubDivision_dictionary = dict(zip(keys, values))
print(SpikePattern_dictionary)

{'A': 0, 'B': 1, 'C': 2, 'D': 3}


In [23]:
df_rail_drop.dtypes

SubDivision                   int32
GangId                        int32
OperationNumber               int64
TrackId                       int32
MegaWorkBlock                 int32
ShadowWorkBlock               int32
SplitWorkBlock                int32
WBMileFrom                  float64
WBMileTo                    float64
AnchorPattern                 int32
SpikePattern                  int32
CwrTerritory               category
DestressingMethod          category
ClosureType                 float64
NumberofInsulatedJoints       int64
NumberofTransitionRails       int64
NumberofCompromiseRails       int64
FastenerTypeTie            category
PlateChangeOutRequired     category
RailType                      int32
TravelTimeDuringBlocks      float64
ExpectedOutput                int64
ConfidenceLevel               int64
Version                       int64
workBlockPlannedMinutes       int64
miles                       float64
dtype: object

In [24]:
df_rail_drop['FastenerTypeTie']=df_rail_drop['FastenerTypeTie'].cat.add_categories('None')
df_rail_drop['FastenerTypeTie'].fillna('None', inplace=True)

In [25]:
df_rail_drop['FastenerTypeTie']=encode.fit_transform(df_rail_drop['FastenerTypeTie'])
keys = encode.classes_
values = encode.transform(encode.classes_)
FastenerTypeTie_dictionary = dict(zip(keys, values))
print(FastenerTypeTie_dictionary)

{'CLIP_RAIL': 0, 'None': 1}


In [26]:
df_rail_drop['PlateChangeOutRequired']=df_rail_drop['PlateChangeOutRequired'].cat.add_categories('None')
df_rail_drop['PlateChangeOutRequired'].fillna('None', inplace=True)

In [27]:
df_rail_drop['PlateChangeOutRequired']=encode.fit_transform(df_rail_drop['PlateChangeOutRequired'])
keys = encode.classes_
values = encode.transform(encode.classes_)
PlateChangeOutRequired_dictionary = dict(zip(keys, values))
print(PlateChangeOutRequired_dictionary)

{'No': 0, 'Yes': 1}


In [28]:
df_rail_drop['CwrTerritory']=encode.fit_transform(df_rail_drop['CwrTerritory'])
keys = encode.classes_
values = encode.transform(encode.classes_)
CwrTerritory_dictionary = dict(zip(keys, values))
print(PlateChangeOutRequired_dictionary)

{'No': 0, 'Yes': 1}


In [29]:
df_rail_drop['DestressingMethod']=encode.fit_transform(df_rail_drop['DestressingMethod'])
keys = encode.classes_
values = encode.transform(encode.classes_)
DestressingMethod_dictionary = dict(zip(keys, values))
print(PlateChangeOutRequired_dictionary)

{'No': 0, 'Yes': 1}


In [30]:
df_rail_drop['ClosureType'].fillna('No', inplace=True)

In [31]:
df_rail_drop['ClosureType']=encode.fit_transform(df_rail_drop['ClosureType'])
keys = encode.classes_
values = encode.transform(encode.classes_)
ClosureType_dictionary = dict(zip(keys, values))
print(PlateChangeOutRequired_dictionary)

{'No': 0, 'Yes': 1}


In [32]:
# store the normalized features data into np array
X = np.array(df_rail_drop)

In [33]:
len(df_rail_drop.columns)

26

In [34]:
X = df_rail_drop.drop('ExpectedOutput', axis=1)
Y= df_rail_drop.pop('ExpectedOutput')

In [35]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.3 , random_state=1)

In [36]:
len(X_train.columns)

25

In [37]:
len(X_test.columns)

25

In [38]:
X_test.head()

Unnamed: 0,SubDivision,GangId,OperationNumber,TrackId,MegaWorkBlock,ShadowWorkBlock,SplitWorkBlock,WBMileFrom,WBMileTo,AnchorPattern,...,NumberofTransitionRails,NumberofCompromiseRails,FastenerTypeTie,PlateChangeOutRequired,RailType,TravelTimeDuringBlocks,ConfidenceLevel,Version,workBlockPlannedMinutes,miles
1772,1,34,310,4,1,1,0,55.310001,55.599998,0,...,1,0,1,0,0,40.0,72,1,360,0.289997
206,58,27,310,3,0,0,0,0.2,0.47,0,...,0,1,1,0,0,40.0,72,1,240,0.27
544,5,30,310,4,0,0,0,4.98,10.57,0,...,0,1,1,1,0,90.0,72,1,480,5.59
1140,1,30,310,4,1,1,0,124.959999,127.669998,0,...,2,0,0,0,0,0.0,72,1,360,2.709999
4272,44,12,310,4,0,0,0,59.93,60.68,0,...,0,0,1,1,0,0.0,72,1,240,0.75


In [39]:
from sklearn.tree import DecisionTreeRegressor

Model_dt = DecisionTreeRegressor(criterion ='mse')
Model_dt.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [40]:
Model_dt.score(X_test, Y_test)

0.9806392147877304

In [41]:
from sklearn.ensemble import RandomForestRegressor
Model_RF = RandomForestRegressor(n_estimators=13, max_depth=15, random_state = 0)
Model_RF.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=13,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [42]:
Model_RF.score(X_test, Y_test)

0.9828741182688525

In [43]:
###Ensembling
from sklearn.ensemble import AdaBoostRegressor
from sklearn import metrics

In [44]:
ada_b= AdaBoostRegressor(Model_RF,learning_rate=0.5, loss='square', n_estimators=10)
ada_b= ada_b.fit(X_train, Y_train)
y_predict = ada_b.predict(X_test)
print(ada_b.score(X_test , Y_test))
#print(metrics.confusion_matrix(Y_test, y_predict))

0.9854362295455947


In [45]:
from sklearn.ensemble import BaggingRegressor
bagg = BaggingRegressor(base_estimator=Model_RF, n_estimators=15,)
bagg = bagg.fit(X_train, Y_train)
y_predict = bagg.predict(X_test)

print(bagg.score(X_test , Y_test))

#print(metrics.confusion_matrix(Y_test, y_predict))

0.9801663379970931


In [46]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
grad = GradientBoostingRegressor(loss='quantile', n_estimators=1)
grad = grad.fit(X_train, Y_train)
y_predict = grad.predict(X_test)

print(grad.score(X_test , Y_test))

#print(metrics.confusion_matrix(Y_test, y_predict))

-1.862293463015333


In [47]:
import pickle

In [48]:
filename = 'finalized_model.sav'
pickle.dump(Model_RF, open(filename, 'wb'))

In [51]:
import csv

In [54]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\GangId_dictionary.csv", "w"))
for key, val in GangId_dictionary.items():
    w.writerow([key, val])

12

12

12

12

12

12

12

12

12

12

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

13

In [55]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\TrackID_dictionary.csv", "w"))
for key, val in TrackID_dictionary.items():
    w.writerow([key, val])

7

7

7

7

5

5

5

8

8

8

9

9

9

9

9

9

9

8

8

8

9

8

8

9

9

9

9

9

9

9

9

9

9

9

9

9

9

9

9

9

8

9

9

9

9

9

8

8

9

9

9

9

9

9

8

8

9

7

7

9

9

9

In [56]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\MegaWorkBlock_dictionary.csv", "w"))
for key, val in MegaWorkBlock_dictionary.items():
    w.writerow([key, val])

6

7

In [57]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\ShadowWorkBlock_dictionary.csv", "w"))
for key, val in ShadowWorkBlock_dictionary.items():
    w.writerow([key, val])

6

7

In [58]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\SplitWorkBlock_dictionary.csv", "w"))
for key, val in SplitWorkBlock_dictionary.items():
    w.writerow([key, val])

7

In [59]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\AnchorPattern_dictionary.csv", "w"))
for key, val in AnchorPattern_dictionary.items():
    w.writerow([key, val])

14

In [60]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\SpikePattern_dictionary.csv", "w"))
for key, val in SpikePattern_dictionary.items():
    w.writerow([key, val])

5

5

5

5

In [74]:
w = csv.writer(encoding = 'unicode_escape', open("..\..\..\Dictionary\RandomForest\Version1\NumberofCompromiseRails_dictionary.csv", "w"))
for key, val in NumberofCompromiseRails_dictionary.items():
    w.writerow([key, val])

SyntaxError: positional argument follows keyword argument (<ipython-input-74-85cb37674b06>, line 1)

In [62]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\RailType_dictionary.csv", "w"))
for key, val in RailType_dictionary.items():
    w.writerow([key, val])

7

In [63]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\SubDivision_dictionary.csv", "w"))
for key, val in SubDivision_dictionary.items():
    w.writerow([key, val])

12

11

12

15

13

11

11

11

12

14

13

17

10

14

17

16

11

14

17

15

12

17

13

13

10

14

10

18

12

11

20

13

16

16

18

14

10

13

11

11

13

12

11

15

12

13

11

13

11

15

16

9

15

15

11

12

13

13

9

12

In [64]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\FastenerTypeTie_dictionary.csv", "w"))
for key, val in FastenerTypeTie_dictionary.items():
    w.writerow([key, val])

13

8

In [65]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\PlateChangeOutRequired_dictionary.csv", "w"))
for key, val in PlateChangeOutRequired_dictionary.items():
    w.writerow([key, val])

6

7

In [66]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\CwrTerritory_dictionary.csv", "w"))
for key, val in CwrTerritory_dictionary.items():
    w.writerow([key, val])

6

7

In [67]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\DestressingMethod_dictionary.csv", "w"))
for key, val in DestressingMethod_dictionary.items():
    w.writerow([key, val])

11

17

8

16

11

In [68]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\ClosureType_dictionary.csv", "w"))
for key, val in ClosureType_dictionary.items():
    w.writerow([key, val])

6

In [70]:
w = csv.writer(open("..\..\..\Dictionary\RandomForest\Version1\Dummy.csv", "w"))
for key, val in ClosureType_dictionary.items():
    w.writerow([key, val])

6