In [None]:
import pandas as pd

In [128]:
# Location of the raw dataset, relative to the notebook's working directory.
csv_path = './resources/dataset.csv'
# Read the CSV into the DataFrame that the following cells inspect and clean.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Display pandas' summary statistics for the freshly loaded frame.
df.describe()

In [None]:
# Per-column count of null entries, restricted to the columns that have at least one.
null_columns = df.isnull().sum().loc[lambda counts: counts > 0]
print(null_columns)

In [None]:
# Count the missing (NaN) entries in every column of the dataset.
# df.isna() is an all-boolean mask that contains no NaN itself, so calling
# .count() on it only reports how many cells exist; .sum() is what tallies
# the True flags, i.e. the actual number of missing values per column.
nan_columns = df.isna().sum()
#nan_columns = nan_columns[nan_columns > 0]
print(nan_columns)
# nan_columns holds, for each column, how many of its values are missing

In [None]:
# Flag rows that repeat an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(duplicate_count)

B....CLEANING  

In [None]:
# Drop the rows that duplicate an earlier row.
df = df.drop_duplicates()
# info() prints its own report and returns None; print() around it only added "None".
df.info()

In [None]:
# Drop every row whose Attrition value (the label) is missing.
df = df.dropna(subset=['Attrition'])
# info() prints its own report and returns None; print() around it only added "None".
df.info()


In [None]:
# Fill the remaining missing values with the mean of their column.
# Only numeric columns have a mean here (numeric_only=True), so gaps in
# non-numeric columns are left as they are.
df = df.fillna(df.mean(numeric_only=True))
# info() prints its own report and returns None; print() around it only added "None".
df.info()

C......SPLITTING

In [104]:
# The Attrition column is the label; every other column is a feature.
label_df = df['Attrition']
feature_df = df.drop('Attrition', axis=1)

In [None]:
# Peek at the first five feature rows.
feature_df.head(5)

In [None]:
# Peek at the first five label values.
label_df.head(5)

In [None]:
# Split feature_df by dtype: this cell keeps the object-dtype (categorical) columns.
# The copy keeps this frame independent of feature_df.
categorical_feature_df = feature_df.select_dtypes(include='object').copy()
categorical_feature_df.head(5)

In [None]:
# The complement of the categorical slice: every column that is not object-dtype.
numerical_feature_df = feature_df.select_dtypes(exclude='object').copy()
numerical_feature_df.head(5)

D......ENCODING

In [109]:
# Switch read by the encoding cell below: True selects one_hot_encoding(),
# False selects label_encoding().
is_one_hot_encoded = True

In [110]:
# One-hot encoding of categorical columns: the columns are converted into numerical values
def one_hot_encoding(df):
    """Encode the object-dtype columns of ``df`` and return a new frame.

    Object columns with more than two distinct values are expanded with
    ``pd.get_dummies``; object columns with at most two distinct values are
    integer-encoded with ``LabelEncoder``. All other columns pass through
    unchanged. The distinct-value counts, the columns chosen for one-hot
    encoding and the number of dummy columns created are printed.

    Args:
        df: DataFrame whose object-dtype columns should be encoded. It is
            not modified.

    Returns:
        A new DataFrame in which every object-dtype column has been encoded.
    """
    from sklearn.preprocessing import LabelEncoder

    # Work on a copy: the label-encoding loop below overwrites columns, and
    # writing into ``df`` itself would silently change the caller's frame.
    working_df = df.copy()

    # Categorical columns and their distinct-value counts, computed once
    # before any column is overwritten.
    cat_columns = working_df.select_dtypes(include=['object']).columns
    unique_counts = working_df[cat_columns].nunique()
    print(unique_counts)

    # More than 2 distinct values -> one-hot; otherwise -> label encoding.
    one_hot_columns = unique_counts[unique_counts > 2].index
    label_columns = unique_counts[unique_counts <= 2].index

    print("columns that will be one hot encoded:",one_hot_columns)

    # Label-encode the (at most) two-valued columns first.
    label_encoder = LabelEncoder()
    for column in label_columns:
        working_df[column] = label_encoder.fit_transform(working_df[column])

    # Then expand the remaining categorical columns into dummy columns.
    encoded_df = pd.get_dummies(working_df, columns=one_hot_columns)

    # One dummy column is created per distinct value of each one-hot column;
    # the label-encoded columns create no new columns and are not counted.
    print("total encoded columns created:",unique_counts[one_hot_columns].sum())

    return encoded_df



In [111]:
def label_encoding(df):
    """Integer-encode the object, category and bool columns of ``df``.

    Each selected column is replaced, in a copy of ``df``, by the codes that
    ``LabelEncoder`` assigns to its values. For every encoded column the
    value-to-code mapping is printed.

    Args:
        df: DataFrame to encode. It is not modified.

    Returns:
        A copy of ``df`` with the selected columns label-encoded.
    """
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    result = df.copy()
    for name in df.select_dtypes(include=['object', 'category', 'bool']).columns:
        result[name] = label_encoder.fit_transform(df[name])
        # Report which original value was mapped to which integer code.
        mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        print(f'{name}: {mapping}')
    return result


In [None]:
# Pick the encoder according to the is_one_hot_encoded switch, then apply it to the features.
encode_features = one_hot_encoding if is_one_hot_encoded else label_encoding
encoded_df = encode_features(feature_df)
encoded_df.head(5)

E.....SCALING

In [113]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [114]:
# Binary (0/1) columns should not be standard scaled (one-hot encoded columns are binary columns).

def standard_scaling(df):
    """Standard-scale every non-binary column of ``df`` with ``StandardScaler``.

    A column is treated as binary, and left untouched, when its non-null
    values are exactly 0 and 1 (boolean dummy columns qualify, since
    True/False compare equal to 1/0). All remaining columns are scaled and
    written back as floating-point columns. The names of the scaled columns
    and the number of binary columns are printed.

    Args:
        df: Fully numeric DataFrame (e.g. the output of the encoding step).
            It is not modified.

    Returns:
        A copy of ``df``, with the same index and column order, in which the
        non-binary columns hold their scaled values.
    """
    # Only true 0/1 indicator columns are exempt from scaling. A plain
    # nunique() == 2 test would also exempt two-valued columns such as 3/4.
    binary_cols = [col for col in df.columns if set(df[col].dropna().unique()) == {0, 1}]
    cols_to_scale = df.columns.difference(binary_cols)

    scaled_df = df.copy()

    # Guard: the scaler cannot be fitted on a frame with zero columns.
    if len(cols_to_scale) > 0:
        scaler = StandardScaler()
        scaled_part = pd.DataFrame(
            scaler.fit_transform(df[cols_to_scale]),
            index=df.index,
            columns=cols_to_scale,
        )
        # Replace each column wholesale instead of writing floats through
        # .loc into integer columns, which is an incompatible-dtype
        # assignment that recent pandas versions deprecate.
        for col in cols_to_scale:
            scaled_df[col] = scaled_part[col]

    # Only show the columns that were scaled
    print(scaled_df.columns.difference(binary_cols))
    # Count of binary columns that were left as they are
    print(len(binary_cols))

    return scaled_df

In [115]:
def min_max_scaling(df):
    """Rescale every column of ``df`` with a default-configured ``MinMaxScaler``.

    The original minimum and maximum of each column are printed by name.

    Args:
        df: Fully numeric DataFrame to scale. It is not modified.

    Returns:
        A new DataFrame with the same columns and the same index as ``df``
        holding the scaled values.
    """
    scaler = MinMaxScaler()
    # Carry the caller's index over: without it the result gets a fresh
    # 0..n-1 index and no longer lines up with frames (such as the label
    # frame) that still carry the original row labels after rows were dropped.
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    # Report, per column, the range the values were scaled from.
    for col in df.columns:
        print(f'{col}: min={df[col].min()}, max={df[col].max()}')
    return scaled_df

In [None]:
# Scale the encoded features and preview the first five rows.
scaled_df = standard_scaling(df=encoded_df)
scaled_df.head(5)

PROCESSING THE LABEL DATAFRAME

In [None]:
# Wrap the label values in a pandas DataFrame and preview the first five rows.
label_df = pd.DataFrame(data=label_df)
label_df.head(5)

In [None]:
# Label-encode the Attrition column of the label frame.
from sklearn.preprocessing import LabelEncoder
# fit() returns the encoder itself, so `encoder` ends up as the fitted instance.
encoder = LabelEncoder().fit(label_df['Attrition'])
label_df['Attrition'] = encoder.transform(label_df['Attrition'])
label_df

F........CORRELATION ANALYSIS

In [None]:
# Correlation of every scaled feature column with the encoded Attrition label.
correlation = scaled_df.corrwith(other=label_df['Attrition'])
print(correlation)


In [None]:
# plot the correlation values
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: one bar per feature, bar length = correlation with Attrition.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=correlation.values, y=correlation.index, ax=ax)
ax.set_title('Correlation with Attrition')
plt.show()


In [None]:
# Top 20 columns by strength of correlation with the label, keeping negative values negative.
# The ranking uses the absolute value (via the sort key), but the values kept in
# `top` are the signed correlations. `correlation` itself is left untouched, so
# re-running the earlier plotting cell still shows the original signed values.
top = correlation.sort_values(key=lambda values: values.abs(), ascending=False).head(20)
top


In [None]:
#plot the highest 20 correlation values
# Horizontal bar chart restricted to the entries of `top`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=top.values, y=top.index, ax=ax)
ax.set_title('Top Correlations with Attrition')
plt.show()

In [90]:
#correlation matrix between the features


# correlation_matrix = scaled_df.corr()
# plt.figure(figsize=(20, 20))
# sns.heatmap(correlation_matrix, annot=True, fmt='.2f')
# plt.title('Correlation Matrix')
# plt.show()


In [None]:
# create a new df merging the scaled_df and label_df
# Put the scaled features and the encoded label side by side, aligned on the row index.
scattered_df = pd.concat(objs=[scaled_df, label_df], axis='columns')
scattered_df.head(5)

In [None]:
import matplotlib.pyplot as plt
import numpy as np 

# # Select the top 20 columns with the highest correlation with the target variable


# Separate the scattered_df based on numeric labels of label_df
# Rows of the merged frame, separated by the encoded Attrition label.
class_0 = scattered_df.loc[scattered_df['Attrition'] == 0]
class_1 = scattered_df.loc[scattered_df['Attrition'] == 1]


# One small panel per top feature: class 0 is drawn along y=0 and class 1 along y=1,
# so the two value distributions of the feature can be compared on a shared x axis.
fig = plt.figure(figsize=(40, 40))
for position, col in enumerate(top.index, start=1):
    ax = fig.add_subplot(5, 6, position)
    ax.scatter(class_0[col], np.zeros_like(class_0[col]), label='0', alpha=0.5)
    ax.scatter(class_1[col], np.ones_like(class_1[col]), label='1', alpha=0.5)
    ax.set_xlabel(col)
    ax.legend()
    ax.set_title(col)


In [None]:
# drop other columns except top 20 columns
# Keep only the scaled columns named in `top`, in that order.
selected_features = scaled_df.loc[:, top.index]
selected_features.head(5)


G......TRAIN A MODEL

In [None]:
# now, do logistic regression with the selected features and label
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): scaling and the top-feature selection were both computed on the full
# dataset before this split, so the test rows influenced preprocessing — consider
# splitting first and fitting those steps on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(selected_features, label_df, test_size=0.2, random_state=4)

model = LogisticRegression()
# y_train is a one-column DataFrame; hand fit() the column itself as a 1-D target,
# which is the shape it expects (a column vector triggers a DataConversionWarning).
model.fit(X_train, y_train['Attrition'])
y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')