* Get the summary of the dataset
* Missing values handling
* Duplicate value handling
* Data distribution
* Outlier handling
    * Replace the outliers using median values
    * Remove outliers
    * Do not change outliers
* Correlation analysis
* Class balancing using SMOTE algorithm
* Feature Scaling
    * Standardization
    * Normalization
* Modeling with hyperparametric tuning
    * Logistic Regression Model
    * Naive Bayes
    * Neural Network

### 1) Import Libraries

In [1]:
import warnings

# Suppress all warnings for the rest of the session.
# NOTE(review): this is global, so it also hides scikit-learn convergence and
# failed-fit warnings raised during the grid searches further down.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , classification_report
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

### 2) Data Preprocessing

#### 2.1) Importing Dataset

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('nba_rookie_data.csv')
# Preview the first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'nba_rookie_data.csv'

In [None]:
# Drop the 'Name' column before the analysis, then preview the result
df = df.drop(columns=['Name'])
df.head()

#### 2.2) Get the brief summary of the dataset

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the dataset
df.info()

#### 2.3) Missing values handling

In [None]:
# Boolean mask: True wherever a cell is missing
missing_values = df.isna()

# Number of missing cells per column
missing_count = missing_values.sum()

# Same figure expressed as a percentage of the row count
missing_percentage = missing_count.div(len(df)).mul(100)

# Two-column summary table, one row per variable
missing_info = missing_count.to_frame('Missing Values').assign(
    **{'Missing Percentage': missing_percentage}
)

# Show the summary as plain text
print(missing_info)


Only one variable has missing values, and only in a small percentage of rows. Let's remove those rows to clean the dataset.

In [None]:
# Heatmap of the missing-value mask of the raw frame (one row per observation)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.isna(), cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values')
plt.show()

In [None]:
# Drop every row that contains at least one missing value; `df` itself is left untouched
data = df.dropna(how='any')

In [None]:
# Same heatmap on the cleaned frame, to confirm no missing values remain
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data.isna(), cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values')
plt.show()

#### 2.4) Duplicate value handling

In [None]:
# keep='first' leaves the first occurrence unmarked and flags every later repeat
is_repeat = data.duplicated(keep='first')
duplicate_rows = data.loc[is_repeat]
num_duplicate_rows = duplicate_rows.shape[0]

print(f"Number of duplicate rows: {num_duplicate_rows}")

There are 11 duplicate rows in the dataset, and we have to remove them.

In [None]:
# Remove duplicate rows, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

In [None]:
# Re-run the duplicate check after de-duplication; the count should now be zero

# keep='first' leaves the first occurrence unmarked and flags every later repeat
is_repeat = data.duplicated(keep='first')
duplicate_rows = data.loc[is_repeat]
num_duplicate_rows = duplicate_rows.shape[0]

print(f"Number of duplicate rows: {num_duplicate_rows}")

#### 2.5) Data distribution

* Density plots for all the variables

In [None]:
# One histogram (with KDE overlay) per column, placed on a 7x3 grid
fig = plt.figure(figsize=(12, 21))  # tall figure so each panel stays readable
for position, column in enumerate(data.columns, start=1):
    ax = fig.add_subplot(7, 3, position)  # grid must have at least as many slots as columns
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

fig.tight_layout()
plt.show()

Apparently variables are not following the normal distribution

* Distribution of response variable

#### 2.6) Outlier handling

* Outlier detection from boxplots

In [None]:
# One box plot per column for the first 20 columns, arranged on a 6x4 grid
data.iloc[:,0:20].plot(kind="box", subplots=True, layout=(6,4), figsize=(10,25))

* Take the percentage of outliers for each variable

In [None]:
# Define a function to calculate the percentage of outliers
def percentage_outliers(series):
    """Return the percentage of values in ``series`` outside the 1.5*IQR fences.

    A value is an outlier when it is strictly below ``Q1 - 1.5*IQR`` or
    strictly above ``Q3 + 1.5*IQR``. The denominator is the full length of
    the series.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    outside_fences = (series < fence_low) | (series > fence_high)
    return int(outside_fences.sum()) / series.size * 100

# Apply the helper column by column: the result is a Series indexed by column
# name holding the percentage of IQR outliers in each variable
outlier_percentages = data.apply(percentage_outliers)

# Display the result
print(outlier_percentages)


There are significant outliers in most of the variables, and we have to handle them.

* Replace outliers with the median


In [None]:
# Outlier option 1: replace outliers with the median value.
# (The previous version wrapped this code in `""""` delimiters, which is a
# SyntaxError: three quotes close the string and the fourth opens a new,
# unterminated one.)
def replace_outliers_with_median(series):
    """Return a copy of ``series`` with IQR outliers replaced by its median.

    Values strictly below ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``
    are replaced by the median of the original series. The input series is
    not modified.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    is_outlier = (series < lower_bound) | (series > upper_bound)
    # mask() builds a new Series, so the caller's data is never mutated
    return series.mask(is_outlier, series.median())

# This option is currently disabled. To enable it, uncomment the line below
# (it processes every column, including the response variable):
# data_new = data.apply(replace_outliers_with_median)

In [None]:
# Outlier option in use: keep the outliers unchanged.
# Work on an independent copy so that any later in-place edit of `data_new`
# cannot silently alter `data`.
data_new = data.copy()

In [None]:
# Summary statistics of the frame used for modeling
data_new.describe()

* Removing Outliers

In [None]:
# Outlier option 2: cap outliers at the IQR fences.
# (The previous version wrapped this code in `""""` delimiters, which is a
# SyntaxError, and the function read the global `data_new` instead of its own
# `data` argument.)
def outliers_for_features(data, col):
    """Return column ``col`` of ``data`` with values capped at the IQR fences.

    Values above ``Q3 + 1.5*IQR`` are set to that upper limit and values below
    ``Q1 - 1.5*IQR`` are set to that lower limit; all other values are kept.
    ``data`` is not modified.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    return data[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

# This option is currently disabled. To enable it, uncomment the lines below:
# data_new = data.copy()
# for col in df.iloc[:,1:19]:
#     data_new[col] = outliers_for_features(data, col)

In [None]:
# Quantile information of the original frame `df` (before rows were dropped), for comparison
df.describe()

#### 2.7) Correlation analysis

* Correlation of each variable with response variable

In [None]:
# Pairwise correlations between all columns
corr_matrix = data_new.corr()

# Figure first, then the seaborn style (same order as before so the look is unchanged)
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")

# Vertical bars: one per variable, height = its correlation with TARGET_5Yrs
target_corr = corr_matrix["TARGET_5Yrs"]
bar_ax = sns.barplot(x=corr_matrix.index, y=target_corr, palette="viridis")

# Titles and axis labels
bar_ax.set_title("Pairwise Correlation Bar Chart")
bar_ax.set_xlabel("Variables")
bar_ax.set_ylabel("Correlation")
plt.xticks(rotation=90)  # variable names are long; rotate them to avoid overlap

plt.show()

* Correlation heatmap to check how each pair of variables is correlated

In [None]:
# Full correlation matrix of the modeling frame
corr_matrix = data_new.corr()

# Large square canvas so every annotated cell is legible
plt.figure(figsize=(20, 20))
sns.set(font_scale=1.2)  # slightly larger fonts for the annotations

# Annotated heatmap: each cell shows the correlation rounded to 2 decimals
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
)

heatmap_ax.set_title("Correlation Heatmap")
plt.show()

#### 2.8) Class balancing

* Check the distribution of the response variable

In [None]:
# Number of rows in each class of the response variable before balancing
count_ax = sns.countplot(data=data_new, x='TARGET_5Yrs')
count_ax.set_title('Class Distribution of Response Variable')
count_ax.set_xlabel('Response Variable')
count_ax.set_ylabel('Count')
plt.show()

Here, the classes of the response variable are not balanced. We have to balance the response variable.

* Balance the response variable using the SMOTE (Synthetic Minority Oversampling Technique) algorithm

In [None]:
# Separate the predictors from the response variable
X = data_new.drop('TARGET_5Yrs', axis=1)
y = data_new['TARGET_5Yrs']

# Oversample the minority class with synthetic samples. A fixed seed makes the
# resampled data, and every result derived from it, reproducible across runs.
# NOTE(review): resampling is done here on the full data, before the
# train/test split made later in the notebook, so synthetic rows end up in the
# test set too; consider resampling only the training split.
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Put the resampled predictors and response back side by side in one frame
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# Preview the balanced frame
balanced_df.head()

In [None]:
# Number of rows in each class of the response variable after SMOTE
count_ax = sns.countplot(data=balanced_df, x='TARGET_5Yrs')
count_ax.set_title('Class Distribution of Response Variable')
count_ax.set_xlabel('Response Variable')
count_ax.set_ylabel('Count')
plt.show()

#### 2.9) Feature scaling

* Standardizing

In [None]:
# Predictors (to be standardized) and response taken from the balanced frame
X_std = balanced_df.drop('TARGET_5Yrs', axis=1)
y = balanced_df['TARGET_5Yrs']

In [None]:
# Predictors before scaling
X_std.head()

In [None]:
# Standardize the predictors with StandardScaler; the result is a NumPy array,
# so the column names are lost.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test rows contribute to the scaling statistics.
X_std = StandardScaler().fit_transform(X_std)

In [None]:
# Preview the standardized values (wrapped in a DataFrame only for display)
pd.DataFrame(X_std).head()

* Normalization

In [None]:
# Predictors (to be min-max normalized) and response taken from the balanced frame
X_nor = balanced_df.drop('TARGET_5Yrs', axis=1)
y = balanced_df['TARGET_5Yrs']

In [None]:
# Rescale the predictors with MinMaxScaler; the result is a NumPy array.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test rows contribute to the scaling range.
X_nor = MinMaxScaler().fit_transform(X_nor)

In [None]:
# Preview the normalized values (wrapped in a DataFrame only for display)
pd.DataFrame(X_nor).head()

### 3) Modeling and Hyperparametric Tuning

#### 3.1) Modeling using standard scaling data

Split the dataset to train and evaluation. Use 'GridSearchCV' approach to training set and evaluate it using testing set

In [None]:
# Hold out 25% of the standardized data for testing; fixed seed for a repeatable split
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(X_std, y, test_size= 1/4, random_state=0)

* Modeling using 'GridSearchCV' method with 5 folds

Let's tune some special parameters of each of the algorithm

#### 3.1.1) Logistic Regression Model

In [None]:
# Create the model and fit.
# Penalty and solver are paired explicitly. The default 'lbfgs' solver only
# supports the 'l2' penalty, so searching 'l1' / 'elasticnet' with it makes
# those fits fail and score NaN, leaving 'l2' as the only real candidate.
# 'saga' supports all three penalties, and 'elasticnet' additionally needs an
# l1_ratio.
lr_std_param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'max_iter': [50, 100, 25]},
    {'penalty': ['l1'], 'solver': ['saga'], 'max_iter': [50, 100, 25]},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5],
     'max_iter': [50, 100, 25]},
]

# random_state fixes the data shuffling done by the 'saga' solver
lr_std = GridSearchCV(LogisticRegression(random_state=0), lr_std_param_grid,
                      cv=5, return_train_score=False)

lr_std.fit(X_train_std, y_train_std)

In [None]:
# Best estimator found by the grid search (refit on the whole training split)
best_model_lr_std = lr_std.best_estimator_
best_model_lr_std

In [None]:
# Mean cross-validated score of the best parameter combination
# (computed on the training split; default scoring for a classifier is accuracy)
lr_std.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
lr_std.best_params_

In [None]:
# Predicted class labels for the test split
y_predicted_lr_std = best_model_lr_std.predict(X_test_std)

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_lr_std.predict_proba(X_test_std)

* Evaluating the model

Let's evaluate model using testing set

In [None]:
# Test-set accuracy of the tuned logistic regression
test_accuracy = best_model_lr_std.score(X_test_std, y_test_std)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_std.shape[0]
n_mislabeled = (y_test_std != best_model_lr_std.predict(X_test_std)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Rows are true classes, columns are predicted classes
cm_lr_std = confusion_matrix(y_test_std, y_predicted_lr_std)

# Annotated heatmap of the confusion matrix (integer counts)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_lr_std, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

* 1: if career length >= 5 yrs
* 0: if career length < 5 yrs

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_std, y_predicted_lr_std))

#### 3.1.2) Naive Bayes

In [None]:
# The parameter grid is empty, so no hyperparameter is tuned here: this is a
# 5-fold cross-validation of GaussianNB with its default settings.
nb_std = GridSearchCV(GaussianNB(), {
}, cv=5, return_train_score=False)

nb_std.fit(X_train_std, y_train_std)

In [None]:
# Best estimator from the search (refit on the whole training split)
best_model_nb_std = nb_std.best_estimator_
best_model_nb_std

In [None]:
# Mean cross-validated score of the best candidate (computed on the training split)
nb_std.best_score_

In [None]:
# Best parameter combination (an empty dict here, because the grid is empty)
nb_std.best_params_

* Evaluating the model

Let's evaluate the model using testing data

In [None]:
# Test-set accuracy of the Naive Bayes search object
test_accuracy = nb_std.score(X_test_std, y_test_std)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_std.shape[0]
n_mislabeled = (y_test_std != nb_std.predict(X_test_std)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_nb_std.predict_proba(X_test_std)

In [None]:
# Predicted class labels for the test split
y_predicted_nb_std = best_model_nb_std.predict(X_test_std)

In [None]:
# Rows are true classes, columns are predicted classes
cm_nb_std = confusion_matrix(y_test_std, y_predicted_nb_std)

# Annotated heatmap of the confusion matrix (integer counts)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_nb_std, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_std, y_predicted_nb_std))

#### 3.1.3) Neural Network

In [None]:
# Grid search over network architecture and solver.
# random_state fixes the weight initialisation and batch shuffling so the
# search result is reproducible from run to run.
# Note: this grid does not try the 'lbfgs' solver, whereas the normalized-data
# search in section 3.2.3 does.
mlp_std = GridSearchCV(MLPClassifier(random_state=0), {
    'hidden_layer_sizes': [[10,20,30],[20,30,40],[50,60,70]],
    #'activation': ['identity','logistic','tanh','relu'],
    'solver': ['sgd','adam'],
    #'alpha' : [0.0001,0.001,0.01],
    #'learning_rate': ['constant','invscaling','adaptive'],
    #'learning_rate_init': [0.0001,0.001,0.01],
    #'max_iter': [200,300,400]
}, cv=5, return_train_score=False)

mlp_std.fit(X_train_std, y_train_std)

In [None]:
# Best estimator found by the grid search (refit on the whole training split)
best_model_mlp_std = mlp_std.best_estimator_
best_model_mlp_std

In [None]:
# Mean cross-validated score of the best parameter combination (computed on the training split)
mlp_std.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
mlp_std.best_params_

* Evaluating the model

Let's evaluate the model using testing data

In [None]:
# Test-set accuracy of the tuned neural network
test_accuracy = mlp_std.score(X_test_std, y_test_std)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_std.shape[0]
n_mislabeled = (y_test_std != mlp_std.predict(X_test_std)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_mlp_std.predict_proba(X_test_std)

In [None]:
# Predicted class labels for the test split
y_predicted_mlp_std = best_model_mlp_std.predict(X_test_std)

In [None]:
# Rows are true classes, columns are predicted classes
cm_mlp_std = confusion_matrix(y_test_std, y_predicted_mlp_std)

# Annotated heatmap of the confusion matrix (integer counts)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_mlp_std, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_std, y_predicted_mlp_std))

#### 3.2) Modeling using normalized data

In [None]:
# Hold out 25% of the normalized data for testing; same seed as the split of
# the standardized data
X_train_nor, X_test_nor, y_train_nor, y_test_nor = train_test_split(X_nor, y, test_size= 1/4, random_state=0)

#### 3.2.1) Logistic regression model

In [None]:
# Create the model and fit.
# Penalty and solver are paired explicitly. The default 'lbfgs' solver only
# supports the 'l2' penalty, so searching 'l1' / 'elasticnet' with it makes
# those fits fail and score NaN, leaving 'l2' as the only real candidate.
# 'saga' supports all three penalties, and 'elasticnet' additionally needs an
# l1_ratio.
lr_nor_param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'max_iter': [50, 100, 25]},
    {'penalty': ['l1'], 'solver': ['saga'], 'max_iter': [50, 100, 25]},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5],
     'max_iter': [50, 100, 25]},
]

# random_state fixes the data shuffling done by the 'saga' solver
lr_nor = GridSearchCV(LogisticRegression(random_state=0), lr_nor_param_grid,
                      cv=5, return_train_score=False)

lr_nor.fit(X_train_nor, y_train_nor)

In [None]:
# Best estimator found by the grid search (refit on the whole training split)
best_model_lr_nor = lr_nor.best_estimator_
best_model_lr_nor

In [None]:
# Mean cross-validated score of the best parameter combination (computed on the training split)
lr_nor.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
lr_nor.best_params_

* Evaluating the model

Let's evaluate the model using testing data

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_lr_nor.predict_proba(X_test_nor)

In [None]:
# Predicted class labels for the test split
y_predicted_lr_nor = best_model_lr_nor.predict(X_test_nor)

In [None]:
# Test-set accuracy of the tuned logistic regression
test_accuracy = best_model_lr_nor.score(X_test_nor, y_test_nor)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_nor.shape[0]
n_mislabeled = (y_test_nor != best_model_lr_nor.predict(X_test_nor)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Confusion matrix of the logistic regression trained on normalized data
# (rows are true classes, columns are predicted classes)
cm_lr_nor = confusion_matrix(y_test_nor,y_predicted_lr_nor)

# Create a heatmap to visualize the confusion matrix.
# Plot cm_lr_nor, the matrix computed just above; the earlier version plotted
# cm_lr_std, i.e. the results of the standardized-data model.
plt.figure(figsize=(6, 4))
sns.heatmap(cm_lr_nor, annot=True, fmt="d", cmap="YlGnBu")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_nor, y_predicted_lr_nor))

#### 3.2.2) Naive Bayes Model

In [None]:
# The parameter grid is empty, so no hyperparameter is tuned here: this is a
# 5-fold cross-validation of GaussianNB with its default settings.
nb_nor = GridSearchCV(GaussianNB(), {
}, cv=5, return_train_score=False)

nb_nor.fit(X_train_nor, y_train_nor)

In [None]:
# Best estimator from the search (refit on the whole training split)
best_model_nb_nor = nb_nor.best_estimator_
best_model_nb_nor

In [None]:
# Mean cross-validated score of the best candidate (computed on the training split)
nb_nor.best_score_

In [None]:
# Best parameter combination (an empty dict here, because the grid is empty)
nb_nor.best_params_

* Evaluating the model

Let's evaluate the model using testing data

In [None]:
# Test-set accuracy of the Naive Bayes search object
test_accuracy = nb_nor.score(X_test_nor, y_test_nor)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_nor.shape[0]
n_mislabeled = (y_test_nor != nb_nor.predict(X_test_nor)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_nb_nor.predict_proba(X_test_nor)

In [None]:
# Predicted class labels for the test split
y_predicted_nb_nor = best_model_nb_nor.predict(X_test_nor)

In [None]:
# Rows are true classes, columns are predicted classes
cm_nb_nor = confusion_matrix(y_test_nor, y_predicted_nb_nor)

# Annotated heatmap of the confusion matrix (integer counts)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_nb_nor, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_nor, y_predicted_nb_nor))

#### 3.2.3) Neural Network

In [None]:
# Grid search over network architecture and solver.
# random_state fixes the weight initialisation and batch shuffling so the
# search result is reproducible from run to run.
mlp_nor = GridSearchCV(MLPClassifier(random_state=0), {
    'hidden_layer_sizes': [[10,20,30],[20,30,40],[50,60,70]],
    #'activation': ['identity','logistic','tanh','relu'],
    'solver': ['lbfgs','sgd','adam'],
    #'alpha' : [0.0001,0.001,0.01],
    #'learning_rate': ['constant','invscaling','adaptive'],
    #'learning_rate_init': [0.0001,0.001,0.01],
    #'max_iter': [200,300,400]
}, cv=5, return_train_score=False)

mlp_nor.fit(X_train_nor, y_train_nor)

In [None]:
# Best estimator found by the grid search (refit on the whole training split)
best_model_mlp_nor = mlp_nor.best_estimator_
best_model_mlp_nor

In [None]:
# Mean cross-validated score of the best parameter combination (computed on the training split)
mlp_nor.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
mlp_nor.best_params_

* Evaluating the model

Let's evaluate the model using testing data

In [None]:
# Test-set accuracy of the tuned neural network
test_accuracy = mlp_nor.score(X_test_nor, y_test_nor)
print('Our Accuracy is %.2f' % test_accuracy)

# Count of test rows whose predicted label differs from the true label
n_test_points = X_test_nor.shape[0]
n_mislabeled = (y_test_nor != mlp_nor.predict(X_test_nor)).sum()
print('Number of mislabeled points out of a total %d points : %d' % (n_test_points, n_mislabeled))

In [None]:
# Predicted class probabilities for each test row (display only, not stored)
best_model_mlp_nor.predict_proba(X_test_nor)

In [None]:
# Predicted class labels for the test split
y_predicted_mlp_nor = best_model_mlp_nor.predict(X_test_nor)

In [None]:
# Rows are true classes, columns are predicted classes
cm_mlp_nor = confusion_matrix(y_test_nor, y_predicted_mlp_nor)

# Annotated heatmap of the confusion matrix (integer counts)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_mlp_nor, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support
print(classification_report(y_test_nor, y_predicted_mlp_nor))