In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import metrics

In [2]:
# Importing all Datasets
races_df = pd.read_csv("races.csv", header=[0], encoding="ISO-8859-1")
qualifying_df = pd.read_csv("qualifying.csv", header=[0], encoding="ISO-8859-1")
driver_standings_df = pd.read_csv("driver_standings.csv", header=[0], encoding="ISO-8859-1")
constructor_standings_df = pd.read_csv("constructor_standings.csv", header=[0], encoding="ISO-8859-1")
driver_results_df = pd.read_csv("results.csv", header=[0], encoding="ISO-8859-1")
constructor_results_df = pd.read_csv("constructor_results.csv", header=[0], encoding="ISO-8859-1")

In [3]:
races_df.sort_values(by=['Year', 'Round'], inplace=True)
races_df.reset_index(drop=True, inplace=True)
races_df['RaceIndexId'] = races_df.index
races_df = races_df.iloc[:, [8, 0, 1, 2, 3]]
races_df = races_df[races_df['Year'] < 2020]
races_df.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId
0,0,833,1950,1,9
1,1,834,1950,2,6
2,2,835,1950,3,19


In [4]:
qualifying_df_temp = qualifying_df
qualifying_df_temp = qualifying_df_temp.iloc[:, [1, 2, 3, 5]]
qualifying_df_temp.head(3)

Unnamed: 0,RaceId,DriverId,ConstructorId,QualifyingPosition
0,18,1,1,1
1,18,9,2,2
2,18,5,1,3


In [5]:
final_data_set = pd.merge(races_df, qualifying_df_temp, on=['RaceId'], how='left')
beforePreprocessing = final_data_set.shape
final_data_set = final_data_set.dropna(subset = ['DriverId', 'ConstructorId'])
afterPreprocessing = final_data_set.shape
print("Before:", beforePreprocessing,"and After:", afterPreprocessing)
final_data_set.head(3)

Before: (8966, 8) and After: (8334, 8)


Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition
548,548,257,1994,1,18,102.0,3.0,1.0
549,548,257,1994,1,18,30.0,22.0,2.0
550,548,257,1994,1,18,55.0,6.0,3.0


In [6]:
years_list = final_data_set['Year'].unique()
raceId_list = final_data_set['RaceId']
raceIndex_list = final_data_set['RaceIndexId']
final_data_set.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition
548,548,257,1994,1,18,102.0,3.0,1.0
549,548,257,1994,1,18,30.0,22.0,2.0
550,548,257,1994,1,18,55.0,6.0,3.0


In [7]:
driver_standings_df['DriverPoints'] = driver_standings_df['DriverPoints'].astype(int)
driver_standings_df.rename(columns={'DriverPoints': 'TotalDriverPoints'}, inplace = True)
driver_standings_data = driver_standings_df.loc[:, ['RaceId', 'DriverId', 'TotalDriverPoints']]
driver_standings_data.head()

Unnamed: 0,RaceId,DriverId,TotalDriverPoints
0,18,1,10
1,18,2,8
2,18,3,6
3,18,4,5
4,18,5,4


In [8]:
driver_results_data = driver_results_df.loc[:, ['RaceId', 'DriverId', 'ConstructorId', 'GridNumber', 'DriverPosition']]
driver_results_data.head()

Unnamed: 0,RaceId,DriverId,ConstructorId,GridNumber,DriverPosition
0,18,1,1,1,1
1,18,2,2,5,2
2,18,3,3,7,3
3,18,4,4,11,4
4,18,5,1,3,5


In [9]:
filtered_driver_results = driver_results_data.loc[driver_results_data['RaceId'].isin(raceId_list)]
filtered_driver_results.head(3)

Unnamed: 0,RaceId,DriverId,ConstructorId,GridNumber,DriverPosition
0,18,1,1,1,1
1,18,2,2,5,2
2,18,3,3,7,3


In [10]:
results = pd.merge(final_data_set, filtered_driver_results, on=['RaceId', 'DriverId', 'ConstructorId'], how='left')
results.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition,GridNumber,DriverPosition
0,548,257,1994,1,18,102.0,3.0,1.0,1.0,\N
1,548,257,1994,1,18,30.0,22.0,2.0,2.0,1
2,548,257,1994,1,18,55.0,6.0,3.0,3.0,3


In [11]:
driver_standings_df_temp = driver_standings_df.iloc[:, 1:4]
driver_standings_df_temp.head(3)
results_temp = pd.merge(results, driver_standings_df_temp, on = ['RaceId', 'DriverId'], how = 'left')
results_temp.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition,GridNumber,DriverPosition,TotalDriverPoints
0,548,257,1994,1,18,102.0,3.0,1.0,1.0,\N,0.0
1,548,257,1994,1,18,30.0,22.0,2.0,2.0,1,10.0
2,548,257,1994,1,18,55.0,6.0,3.0,3.0,3,4.0


In [12]:
constructor_standings_df_temp = constructor_standings_df.iloc[:, 1:5]
constructor_standings_df_temp.head(3)

Unnamed: 0,RaceId,ConstructorId,ConstructorPoints,ConstructorPosition
0,18,1,14.0,1
1,18,2,8.0,3
2,18,3,9.0,2


In [13]:
results_latest = pd.merge(results_temp, constructor_standings_df_temp, on = ['RaceId', 'ConstructorId'], how = 'left')
results_latest.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition,GridNumber,DriverPosition,TotalDriverPoints,ConstructorPoints,ConstructorPosition
0,548,257,1994,1,18,102.0,3.0,1.0,1.0,\N,0.0,6.0,2.0
1,548,257,1994,1,18,30.0,22.0,2.0,2.0,1,10.0,10.0,1.0
2,548,257,1994,1,18,55.0,6.0,3.0,3.0,3,4.0,4.0,3.0


In [14]:
results_latest['ConstructorPoints'].fillna(inplace=True, value=0)
results_latest['TotalDriverPoints'].fillna(inplace=True, value=0)
#Nan - GridNumber[nan], DriverPosition[\\N, nan], ConstructorPosition [nan]

In [15]:
results_latest.to_csv('Final_Preprocessed_Data.csv')

In [16]:
new_data_temp = results_latest.dropna()
results_latest.shape
new_data_temp.head(3)

Unnamed: 0,RaceIndexId,RaceId,Year,Round,CircuitId,DriverId,ConstructorId,QualifyingPosition,GridNumber,DriverPosition,TotalDriverPoints,ConstructorPoints,ConstructorPosition
0,548,257,1994,1,18,102.0,3.0,1.0,1.0,\N,0.0,6.0,2.0
1,548,257,1994,1,18,30.0,22.0,2.0,2.0,1,10.0,10.0,1.0
2,548,257,1994,1,18,55.0,6.0,3.0,3.0,3,4.0,4.0,3.0


In [17]:
new_data_temp = new_data_temp[new_data_temp.DriverPosition != '\\N']
new_data_temp.head(3)
new_data_temp.to_csv("Preprocessed _Data.csv")

In [18]:
final_sample_data = new_data_temp.iloc[:, [3, 4, 5, 6, 7, 8, 10,11, 12]]
X = final_sample_data
Y = new_data_temp['DriverPosition']
print("X",X.shape,"and Y:",Y.shape)

X (6156, 9) and Y: (6156,)


In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01)
print("Train",X_train.shape,"and Test:",X_test.shape)

Train (6094, 9) and Test: (62, 9)


In [20]:
lr_model = LogisticRegression(solver='lbfgs')
model = lr_model.fit(X_train, Y_train)
Y_pred = lr_model.predict(X_test)
count_misclassified = (Y_test != Y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(Y_test, Y_pred)
print('Accuracy: {:.2f}'.format(accuracy))



Misclassified samples: 53
Accuracy: 0.15




In [21]:
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier(max_depth = 2).fit(X_train, Y_train) 
dtree_predictions = dtree_model.predict(X_test) 

count_misclassified = (Y_test != dtree_predictions).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(Y_test, dtree_predictions)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 56
Accuracy: 0.10


In [22]:
from sklearn.neighbors import KNeighborsClassifier 
for i in range(1, 31):
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train, Y_train) 
    # accuracy on X_test 
    accuracy = knn.score(X_test, Y_test) 
    print ("Neighbors",(i+1), ":",accuracy) 

Neighbors 2 : 0.16129032258064516
Neighbors 3 : 0.0967741935483871
Neighbors 4 : 0.12903225806451613
Neighbors 5 : 0.1935483870967742
Neighbors 6 : 0.12903225806451613
Neighbors 7 : 0.12903225806451613
Neighbors 8 : 0.0967741935483871
Neighbors 9 : 0.04838709677419355
Neighbors 10 : 0.06451612903225806
Neighbors 11 : 0.08064516129032258
Neighbors 12 : 0.11290322580645161
Neighbors 13 : 0.0967741935483871
Neighbors 14 : 0.0967741935483871
Neighbors 15 : 0.12903225806451613
Neighbors 16 : 0.0967741935483871
Neighbors 17 : 0.14516129032258066
Neighbors 18 : 0.14516129032258066
Neighbors 19 : 0.14516129032258066
Neighbors 20 : 0.16129032258064516
Neighbors 21 : 0.1935483870967742
Neighbors 22 : 0.20967741935483872
Neighbors 23 : 0.1774193548387097
Neighbors 24 : 0.14516129032258066
Neighbors 25 : 0.12903225806451613
Neighbors 26 : 0.1774193548387097
Neighbors 27 : 0.1774193548387097
Neighbors 28 : 0.1774193548387097
Neighbors 29 : 0.1935483870967742
Neighbors 30 : 0.1935483870967742
Neighb

In [23]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB().fit(X_train, Y_train) 
gnb_predictions = gnb.predict(X_test) 
  
# accuracy on X_test 
accuracy = gnb.score(X_test, Y_test) 
print (accuracy) 
  

0.12903225806451613


In [26]:
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


def baseline_model():
    model = Sequential()
    model.add(Dense(8, input_dim=9, activation='relu'))
    model.add(Dense(24, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))



ValueError: Error when checking target: expected dense_18 to have shape (24,) but got array with shape (23,)