In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# All six raw tables are read with the same options: first row as the header
# and ISO-8859-1 encoding.
csv_options = {"header": [0], "encoding": "ISO-8859-1"}

races_df = pd.read_csv("races.csv", **csv_options)
qualifying_df = pd.read_csv("qualifying.csv", **csv_options)
driver_standings_df = pd.read_csv("driver_standings.csv", **csv_options)
constructor_standings_df = pd.read_csv("constructor_standings.csv", **csv_options)
driver_results_df = pd.read_csv("results.csv", **csv_options)
constructor_results_df = pd.read_csv("constructor_results.csv", **csv_options)

In [3]:
# Keep only the race columns used downstream, coerce them to numeric dtypes and
# order the races chronologically (season, then round within the season).
#REFERENCES: https://stackoverflow.com/questions/15891038/change-data-type-of-columns-in-pandas
races_list = (
    races_df.loc[:, ['RaceId', 'Year', 'Round', 'CircuitId']]
    .apply(pd.to_numeric)
    .sort_values(by=['Year', 'Round'])
)
# Restrict to the 1950-2019 seasons; between() includes both end points.
race_filter = races_list['Year'].between(1950,2019)
races_list = races_list[race_filter]
print("RACES\n","Shape:", races_list.shape,"and Unique RaceIds:", races_list.RaceId.unique().shape)
# RaceIds of the selected seasons; later cells use this to filter the other tables.
required_raceIds = races_list.RaceId.unique()
races_list.head()

RACES
 Shape: (1018, 4) and Unique RaceIds: (1018,)


Unnamed: 0,RaceId,Year,Round,CircuitId
832,833,1950,1,9
833,834,1950,2,6
834,835,1950,3,19
835,836,1950,4,66
836,837,1950,5,13


In [4]:
# Qualifying columns needed for the merge, coerced to numeric dtypes in one chain.
qualifying_columns = ['RaceId', 'DriverId', 'ConstructorId', 'QualifyingPosition']
qualifying_data = qualifying_df.loc[:, qualifying_columns].apply(pd.to_numeric)
# Number of distinct races that have qualifying data.
qualifying_data['RaceId'].unique().shape

(386,)

In [5]:
# Keep only the qualifying rows whose RaceId is one of required_raceIds.
# REFERENCES: https://stackoverflow.com/questions/17071871/how-to-select-rows-from-a-dataframe-based-on-column-values
in_selected_races = qualifying_data['RaceId'].isin(required_raceIds)
qualifying_dataset = qualifying_data.loc[in_selected_races]
print("QUALIFYING\n","Shape:", qualifying_dataset.shape,"and Unique RaceIds:", qualifying_dataset.RaceId.unique().shape)
# Sort by race and driver, then persist the filtered table (index included).
qualifying_dataset = qualifying_dataset.sort_values(by=['RaceId', 'DriverId'])
qualifying_dataset.to_csv("QualfyingFilteredNew.csv")
qualifying_dataset['RaceId'].unique().shape


QUALIFYING
 Shape: (8334, 4) and Unique RaceIds: (386,)


(386,)

In [6]:
# Rename DriverPoints -> TotalDriverPoints. The rename is guarded so the cell can
# be re-run: once the column has been renamed, 'DriverPoints' no longer exists
# and an unconditional lookup would raise KeyError.
if 'DriverPoints' in driver_standings_df.columns:
    driver_standings_df = driver_standings_df.rename(columns={'DriverPoints': 'TotalDriverPoints'})
# NOTE(review): astype(int) truncates any fractional points — confirm that is intended.
driver_standings_df['TotalDriverPoints'] = driver_standings_df['TotalDriverPoints'].astype(int)
# Per-race, per-driver points total used as a model feature.
driver_standings_data = driver_standings_df.loc[:, ['RaceId', 'DriverId', 'TotalDriverPoints']]
driver_standings_data.head()

Unnamed: 0,RaceId,DriverId,TotalDriverPoints
0,18,1,10
1,18,2,8
2,18,3,6
3,18,4,5
4,18,5,4


In [7]:
# Columns of the raw results table that are carried into the model dataset.
driver_result_columns = ['RaceId', 'DriverId', 'ConstructorId', 'GridNumber', 'DriverPosition']
driver_results_data = driver_results_df.loc[:, driver_result_columns]
driver_results_data.head()

Unnamed: 0,RaceId,DriverId,ConstructorId,GridNumber,DriverPosition
0,18,1,1,1,1
1,18,2,2,5,2
2,18,3,3,7,3
3,18,4,4,11,4
4,18,5,1,3,5


In [8]:
# Keep only the result rows that belong to the selected races.
result_in_selected_races = driver_results_data['RaceId'].isin(required_raceIds)
driver_results_dataset = driver_results_data.loc[result_in_selected_races]
print("DRIVER RESULTS\n","Shape:", driver_results_dataset.shape,"and Unique RaceIds:", driver_results_dataset.RaceId.unique().shape)
# Persist the filtered results (row order is left as in the source table).
driver_results_dataset.to_csv("driverResultsNew.csv")
driver_results_dataset.head()

DRIVER RESULTS
 Shape: (24600, 5) and Unique RaceIds: (1017,)


Unnamed: 0,RaceId,DriverId,ConstructorId,GridNumber,DriverPosition
0,18,1,1,1,1
1,18,2,2,5,2
2,18,3,3,7,3
3,18,4,4,11,4
4,18,5,1,3,5


In [9]:
# Left join: every result row is kept; QualifyingPosition is NaN where no
# qualifying row matches the (race, driver, constructor) key.
merge_keys = ['RaceId', 'DriverId', 'ConstructorId']
result = driver_results_dataset.merge(qualifying_dataset, on=merge_keys, how='left')
# Written with its index, so the file carries an extra leading index column.
result.to_csv("Resultant Data New.csv")
result.head()

Unnamed: 0,RaceId,DriverId,ConstructorId,GridNumber,DriverPosition,QualifyingPosition
0,18,1,1,1,1,1.0
1,18,2,2,5,2,5.0
2,18,3,3,7,3,7.0
3,18,4,4,11,4,12.0
4,18,5,1,3,5,3.0


In [None]:
# Reload the merged results/qualifying table from disk.
merged_csv_path = 'Resultant Data New.csv'
final_dataset = pd.read_csv(merged_csv_path)
final_dataset.head()

In [None]:
# Attach the race metadata to every result row (left join keeps all result rows).
new_finalSet = pd.merge(final_dataset, races_list, on = ['RaceId'], how = 'left')
# Select the columns by name instead of by position: positional indices silently
# pick the wrong columns if the reloaded CSV's layout or the merge order changes.
# These are the same seven columns the positions [1, 2, 3, 4, 5, 6, 9] referred to.
kept_columns = ['RaceId', 'DriverId', 'ConstructorId', 'GridNumber',
                'DriverPosition', 'QualifyingPosition', 'CircuitId']
new_finalSet = new_finalSet.loc[:, kept_columns]
print("Shape:",new_finalSet.shape)
new_finalSet.head()

In [None]:
# Add each driver's points total, fill every missing value with 0 and store the
# points as integers.
new_set = (
    pd.merge(new_finalSet, driver_standings_data, on = ['RaceId', 'DriverId'], how = 'left')
    .fillna(0)
    .astype({'TotalDriverPoints': int})
)
# Reorder columns by position so that the column at index 4 ends up last.
# REFERENCES: https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
new_order = [0,1,2, 3, 5, 6, 7, 4]
new_set = new_set[new_set.columns[new_order]]
new_set.head()

In [None]:
# Persist the full table first, then continue without its first column.
new_set.to_csv('DataSetWithPointsNew.csv')
dataset_with_points = new_set.iloc[:, 1:]
dataset_with_points.head()

In [None]:
# Clean the DriverPosition label column: entries equal to '\\N' are replaced by the
# placeholder '22' and the whole column is cast to int.
# NOTE(review): '\\N' presumably marks a driver without a classified finish — confirm.
# assign() builds a new frame, so nothing is written into the slice created earlier.
dataset_with_points = dataset_with_points.assign(
    DriverPosition=dataset_with_points['DriverPosition'].replace(['\\N'], '22').astype(int)
)
# Derive the modelling frame only after the cast so its labels are ints as well.
# No '\\N' entries can remain at this point, so every row is kept.
filterData = dataset_with_points.copy()
filterData.shape

In [None]:
# REFERENCES: https://www.pluralsight.com/guides/importing-and-splitting-data-into-dependent-and-independent-features-for-ml
# Features are every column except the last; the last column is the label.
X = filterData.iloc[:, :-1].values
Y = filterData.iloc[:, -1].values
# Hold out 10% for testing. The fixed random_state makes the split, and therefore
# every score computed from it, reproducible across runs.
X_train, X_test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.1, random_state=42)
print("Train Data:",X_train.shape,"and Test Data:",X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import metrics

# NOTE(review): kfold is created but not used by the code in this cell.
kfold = KFold(n_splits = 5)
lr_model = LogisticRegression(solver='lbfgs', max_iter=10000)
model = lr_model.fit(X_train, Y_Train)
# Predict on the held-out features so the predictions line up with Y_Test.
# Predicting on X_train here would give an array of a different length than
# Y_Test and a meaningless (or failing) comparison.
Y_pred = lr_model.predict(X_test)
count_misclassified = (Y_Test != Y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(Y_Test, Y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree fitted on the training split.
dtree_model = DecisionTreeClassifier(max_depth = 8)
dtree_model.fit(X_train, Y_Train)
dtree_predictions = dtree_model.predict(X_test)

# Score the predictions against the held-out labels.
prediction_is_wrong = Y_Test != dtree_predictions
count_misclassified = prediction_is_wrong.sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(Y_Test, dtree_predictions)
print('Accuracy: {:.2f}'.format(accuracy))

# k-nearest-neighbours classifier (k = 20) fitted on the training split.
# The import must be live: no other visible cell brings KNeighborsClassifier
# into scope, so without it the next line raises NameError.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 20).fit(X_train, Y_Train)

# accuracy on X_test
accuracy = knn.score(X_test, Y_Test)
print (accuracy)
 