In [None]:
# Importing essential libraries.
import json
import re

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Reading the data
df_orig = pd.read_csv('Training Data.csv')

# Presenting data in the dataframe.
df_orig.head()

**1. Show overall descriptive statistics of your dataset; number of data points, number of descriptive features, type of features, your target feature, and its type. (10 points)**

In [None]:
# Number of records present currently in the dataframe. Here 252000 indicates the number of rows 
# and 13 indicates the number of columns present.
df_orig.shape

In [None]:
# Display all the columns in the dataframe along with respective data type.
df_orig.info()

In [None]:
# Standardising the feature names in the dataframe: the upper-case source
# columns are renamed to the Title_Case convention used by the other columns.
column_renames = {
    'CITY': 'City',
    'STATE': 'State',
    'CURRENT_JOB_YRS': 'Current_Job_Years',
    'CURRENT_HOUSE_YRS': 'Current_House_Years',
}
df_orig.rename(columns=column_renames, inplace=True)
list(df_orig.columns)

**The following dataframe tells us more about the dataset. It describes each feature's data type as well as the category to which the feature belongs.**

In [None]:
# Data description of all features present in the file.
data_desc = pd.read_csv('Data_Dictionary.csv')
data_desc

In [None]:
# Exclude target label from given features in analysis
df = df_orig.iloc[:,:-1]
df.head()

In [None]:
# Segregating numerical features: every 'int' column of the data dictionary
# except the target label 'Risk_Flag'.
is_int_feature = (data_desc['Data_Type'] == 'int') & (data_desc['Column_Name'] != 'Risk_Flag')
numerical_vars = data_desc.loc[is_int_feature, 'Column_Name'].reset_index(drop=True)
numerical_vars

In [None]:
# Segregating string variables, all of which are categorical.
is_string_feature = data_desc['Data_Type'] == 'string'
string_vars = data_desc.loc[is_string_feature, 'Column_Name'].reset_index(drop=True)
string_vars

**The statistical information of the numerical features in our dataset is as follows:-**

In [None]:
# Statistical information about the numeric data.
# One summary record is collected per feature and the table is built in a
# single DataFrame call. The previous version concatenated one-row frames
# inside the loop, which left every row with the same index label (0).
numeric_records = []

for var in numerical_vars:
    column = df[var]
    numeric_records.append({
        "Feature": var,
        "Total Count": len(column),
        "Null Value Count": column.isnull().sum(),
        "Cardinality": column.nunique(),
        "Maximum Value": column.max(),
        "Mininum Value": column.min(),
        "Q1": column.quantile(0.25),
        "Mean Value": column.mean(),
        "Q3": column.quantile(0.75),
        "Std. Dev": column.std(),
    })

desc_numeric = pd.DataFrame(numeric_records)
desc_numeric

**The statistical information for all the features which are categorical and of string type.**

In [None]:
# Statistical information about the string (categorical) data.
# Records are gathered in a list and turned into a frame once, so the result
# has a clean 0..n-1 index instead of the repeated label 0 that per-iteration
# pd.concat produced.
string_records = []

for var in string_vars:
    column = df[var]
    total_count = len(column)
    # mode() can return several values on ties; the first one is reported.
    mode_value = column.mode()[0]
    mode_frequency = (column == mode_value).sum()
    string_records.append({
        "Feature": var,
        "Total Count": total_count,
        "Null Value Count": column.isnull().sum(),
        "Cardinality": column.nunique(),
        "Mode Value": mode_value,
        "Mode Frequency": mode_frequency,
        "Mode Ratio": mode_frequency / total_count,
    })

desc_string = pd.DataFrame(string_records)
desc_string

**2. Explore your features further in their distributions and plot their box plots. Show outliers for each feature. Do you think any of the outliers may impact your analysis? Why? Provide supporting visualizations with their analysis. (20 points)**

#### Plotting Histograms of all Numeric features along with their probability density

In [None]:
# Pairwise correlation of the numeric features only. `df` still contains
# string columns, which DataFrame.corr() cannot correlate (pandas >= 2.0 raises
# instead of silently dropping them), so they are excluded explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# To study the data distribution of the numeric columns. The 'Id' column is
# excluded as it has a unique value for each entry.
# The exclusion is done by name rather than by slicing off the first element,
# so re-running this cell no longer drops one more feature each time.
# NOTE(review): this assumes 'Id' was the entry the original `[1:]` slice removed.
numerical_vars = numerical_vars[numerical_vars != 'Id']

sns.set(style="darkgrid")
for num in numerical_vars:
    fig, ax = plt.subplots(1, 1, figsize=(7, 7))
    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.histplot(data=df, x=num, kde=True, ax=ax)
    ax.set_title("Histogram for {}".format(num), fontsize=14, fontweight='bold')

### Plotting Bar Graphs for portraying the distribution of some of the Categorical Variables.

In [None]:
# Only the first three string variables are plotted; City, Profession and
# State are left out of the frequency-distribution bar charts.
string_vars = string_vars[0:3]

for val in string_vars:
    sns.set(style="darkgrid")
    fig, axs = plt.subplots(1, 1, figsize=(8, 8))
    frequencies = df[val].value_counts()
    bar_ax = frequencies.plot(kind='bar', ylabel="Frequency", color='#69b3a2', rot=0, fontsize=12, ax=axs)
    bar_ax.set_title("Bar Plot for {} ".format(val), fontsize=14, fontweight='bold')

We have computed boxplots for all the numeric features present in our dataset. As is evident from the boxplots below, there are no outliers in our dataset: none of the data points that will eventually be passed to our model lie more than 1.5 × IQR above the upper quartile or more than 1.5 × IQR below the lower quartile. We do not possess any data point that is conspicuously different from the rest of the dataset.

In [None]:
# One seaborn boxplot per numeric feature, each drawn on its own figure.
for num in numerical_vars:
    sns.set(style="darkgrid")
    fig, axs = plt.subplots(1, 1, figsize=(7, 7))
    box_ax = sns.boxplot(data=df[num], palette='deep', ax=axs)
    box_ax.set_title("Boxplot for {} ".format(num), fontsize=14, fontweight='bold')

In [None]:
# Visualising all the numerical variables side by side after min-max scaling,
# so that features with very different magnitudes share one axis.
sc_x = MinMaxScaler()
numeric_frame = df[numerical_vars]
scaled_numeric = pd.DataFrame(sc_x.fit_transform(numeric_frame), columns=numeric_frame.columns)
scaled_numeric.plot(kind='box', figsize=(16, 8))

**4. What data pre-processing do you apply? E.g., encoding features, missing values, scaling, etc. Explain each process and why you use it. (10 points)**

### Data Preprocessing:

**In order to remove inconsistencies in the categorical (textual) features, we have created a function which eliminates numbers, as they are not needed for these features. It also removes stray parentheses and square brackets, replaces underscores with spaces and drops any non-ASCII special characters. We have then added the refined data back to our dataframe.**

In [None]:
#data_desc[data_desc['Data_Type']=='string']['Column_Name']

In [None]:
# String-typed columns listed in the data dictionary; the slice below keeps
# the entries from the fourth one onwards, which are the columns cleaned later.
temp_unclean_cols = data_desc[data_desc['Data_Type']=='string']['Column_Name']
# .copy() yields an independent frame, so the column assignments performed on
# it afterwards do not write into a slice of `df` (SettingWithCopyWarning).
# The former empty-DataFrame initialisation was dead code and is removed.
temp_str_df = df[temp_unclean_cols[3:]].copy()
temp_str_df.head()

In [None]:
#df[temp_unclean_cols]

In [None]:
def clean_text(text):
    """Return `text` with non-ASCII characters, brackets and digits removed.

    Steps, applied in order:
      1. characters that cannot be encoded as ASCII are dropped;
      2. round and square brackets are deleted;
      3. decimal digits are deleted;
      4. every underscore is replaced by a single space.

    `text` must be a `str`; the cleaned string is returned.
    """
    # (pattern, replacement) pairs, executed sequentially.
    substitutions = (
        (r"[()\[\]]", ""),
        (r"[0-9]", ""),
        (r"[_]", " "),
    )
    cleaned = text.encode('ascii', errors='ignore').decode('utf8')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
temp_cols = temp_unclean_cols[3:]
# Start from a fresh copy of the raw columns: the assignments below then write
# to an independent frame (no SettingWithCopyWarning) and the cell gives the
# same result however often it is re-run.
temp_str_df = df[temp_cols].copy()
for col in temp_cols:
    temp_str_df[col] = temp_str_df[col].apply(clean_text)

# Carry the record identifier along; it is the key used to merge the cleaned
# columns back with the remaining features.
temp_str_df['Id'] = df_orig['Id']
temp_str_df.head()

In [None]:
# Positional pieces of `df`: the first seven columns, and the first column
# together with the last two columns.
df_begin = df.iloc[:, :7]
df_end = df.iloc[:, [0, -2, -1]]

# Stitch the pieces and the cleaned text columns back together on the 'Id' key.
df_refined = (
    df_begin
    .merge(temp_str_df, on='Id', how='inner')
    .merge(df_end, on='Id', how='inner')
)
df_refined.head()

In [None]:
px.treemap(df_refined, values = "Income", path = ["Current_Job_Years", "Profession"], hover_name = "Profession", color = "Experience")

In [None]:
px.sunburst(df_refined, values = "Current_House_Years", color = "Age", 
            path = ["House_Ownership", "Current_House_Years", "Age"], color_continuous_scale = "tealgrn")

In [None]:
px.histogram(df_refined, x = "Married/Single", hover_name = "Age", color = "Car_Ownership",nbins=20)

In [None]:
india_json = r'India_Geodata.json'

# px.choropleth needs the GeoJSON content itself (a dict), not the path of a
# local file, so the file is loaded first.
with open(india_json, encoding='utf-8') as geo_file:
    india_geojson = json.load(geo_file)

# One value per state: colouring by the raw per-applicant rows would draw every
# state many times over, so the mean income per state is plotted instead.
state_income = df_refined.groupby('State', as_index=False)['Income'].mean()

fig = px.choropleth(
    state_income,
    geojson = india_geojson,
    # NOTE(review): state names in df_refined must match the 'ST_NM' property
    # of the GeoJSON features exactly for a region to be coloured - confirm.
    featureidkey = 'properties.ST_NM',
    locations = 'State',
    color = 'Income',
    color_continuous_scale = 'Reds'
)

fig.update_geos(fitbounds="locations", visible=False)

fig.show()

In [None]:
# Independent copy of the categorical columns that get encoded, so that column
# assignments on df_str never write into a slice of df_refined.
df_str = df_refined[['Married/Single', 'House_Ownership', 'Car_Ownership','State']].copy()

### Label Encoding
**We have performed label encoding on the categorical variables.**


In [None]:
# Label-encode each categorical column and keep the fitted encoder per column.
# The text values are always read from df_refined, so re-running the cell
# never re-encodes already-encoded integers and `lencoder[...].classes_` keeps
# the original labels. Building a new frame also avoids assigning into a slice.
lencoder = {}
encoded_columns = {}
for col in df_str.columns:
    label_encoder = preprocessing.LabelEncoder()
    encoded_columns[col] = label_encoder.fit_transform(df_refined[col])
    lencoder[col] = label_encoder
df_str = pd.DataFrame(encoded_columns, index=df_refined.index)

In [None]:
df_str.head(5)

### One-Hot Encoding:

**As we are aware that Scikit learn models work only on numeric data, we have encoded the necessary categorical features to numeric format using 'One-Hot' Encoding technique. This technique will convert our textual data to an array of 0s and 1s.**

In [None]:
# One-hot encode every label-encoded column; the fitted encoder of each column
# is kept in `oheencoder` and the indicator frames are joined side by side.
oheencoder = {}
dfs = []
for feature in df_str.columns:
    enc = preprocessing.OneHotEncoder()
    indicator_matrix = enc.fit_transform(df_str.loc[:, [feature]]).toarray()
    # Indicator columns are named "<feature>_<original class label>".
    col_names = [str(feature) + '_' + str(label) for label in lencoder[feature].classes_]
    oheencoder[feature] = enc
    dfs.append(pd.DataFrame(indicator_matrix, columns=col_names))
df_ohe = pd.concat(dfs, axis=1)

In [8]:
df_ohe.head(5).iloc[:,:4]

NameError: name 'df_ohe' is not defined

In [None]:
# now we will combine the numberical and one hot encoded vectors
df_processed = pd.concat([df_refined[numerical_vars], df_ohe], axis = 1)
df_processed

**5. Analyze the balance or distribution of your target variable. Do you think any of these will present a problem and why? Provide supporting visualizations with their analysis. (10 points)**

In [None]:
# Checking the distribution of our target variable.
plt.figure(figsize=(10,6))
ax = sns.countplot(x=df_orig['Risk_Flag'], data=df_orig)
# Tick and label styling is applied after plotting so it reliably affects the
# ticks seaborn creates. The original set the x tick size twice and never
# touched the y ticks.
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Risk_Flag', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
# Write each bar's count just below its top edge.
for p in ax.patches:
    ax.annotate(f'\n{p.get_height()}', (p.get_x()+0.38, p.get_height()), ha='center', va='top', color='white', size=18)

**The above countplot exhibits the distribution of our target variable 'Risk_Flag'. Unfortunately, we are faced with imbalanced data for the target variable. This may influence the learning of our model(s) and may give us biased results. We will explore that shortly.**

**6. What kind of ML approaches and algorithms do you take and why? E.g., supervised,
regression, classification, binary, multi-class, split rate of data, logistic regression, SVM,
decision trees etc. Provide supporting visualizations with their analysis. (10 points)**

In [None]:
x = df_processed.values
y = df_orig.Risk_Flag.values

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

## Scaling:

**The scaling technique that we have used to scale our near-final dataset is Scikit-learn's StandardScaler. It standardises each feature by removing the mean and scaling to unit variance, so every feature ends up with a mean of 0 and a standard deviation of 1 (the values are not confined to the range 0 to 1). Scaling prevents features with large magnitudes or high variance from biasing the learning of the model.**

### Standard Scaling

In [9]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

NameError: name 'x_train' is not defined

### Correlation Matrix - Feature Selection:

In [None]:
# Heatmap showing the correlation between the numeric features.
# df_refined also holds text columns; they are excluded explicitly because
# DataFrame.corr() cannot correlate strings (pandas >= 2.0 raises on them).
numeric_corr = df_refined.select_dtypes(include='number').corr()

plt.figure(figsize = (15, 8))
hm = sns.heatmap(numeric_corr, vmin = -1, vmax = 1, annot = True, cmap = 'RdBu')
hm.set_title('Feature Correlation', fontdict = {'fontsize' : 18}, pad = 12)
plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(random_state = 0)
classifier.fit(x_train,y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


def get_results(classifier, x_eval=None, y_eval=None):
    """Evaluate a fitted classifier that implements `predict` and `predict_proba`.

    Draws the confusion matrix as a heatmap, prints the classification report
    and the ROC AUC (rounded to two decimals), and returns the ROC curve
    together with the macro-averaged F1 score.

    Parameters
    ----------
    classifier : fitted estimator exposing `predict` and `predict_proba`.
    x_eval, y_eval : optional evaluation features / labels. When omitted, the
        notebook-level `x_test` / `y_test` are used, which is what the
        original single-argument call did.

    Returns
    -------
    (fpr, tpr, thresh, f1) : the three arrays from `roc_curve` for the positive
        class (label 1) and the macro F1 score rounded to two decimals.
    """
    # Fall back to the global hold-out split so existing calls keep working.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    y_pred = classifier.predict(x_eval)
    # Column 1 of predict_proba is used as the score of the positive class.
    positive_scores = classifier.predict_proba(x_eval)[:, 1]
    fpr, tpr, thresh = roc_curve(y_eval, positive_scores, pos_label=1)

    cm = confusion_matrix(y_eval, y_pred)
    plt.figure(figsize=(8,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    print(classification_report(y_eval, y_pred))
    auc = round(roc_auc_score(y_eval, positive_scores), 2)
    print('AUC Score is - ',auc)
    f1 = round(f1_score(y_eval, y_pred, average='macro'), 2)
    return (fpr, tpr, thresh, f1)

**Performance Metrics for Logistic Regression**

In [None]:
fpr1, tpr1, thresh1, f1 = get_results(classifier)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(x_train,y_train.flatten())

**Performance metrics using Decision Tree Classifier**

In [None]:
fpr2, tpr2, thresh2, f2 = get_results(classifier)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state = 0)
classifier.fit(x_train,y_train.flatten())

**Performance metrics using Random Forest Classifier**

In [None]:
fpr3, tpr3, thresh3, f3 = get_results(classifier)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(learning_rate=0.1)
classifier = classifier.fit(x_train,y_train.flatten())

In [None]:
fpr4, tpr4, thresh4, f4 = get_results(classifier)

In [None]:
import xgboost as xgb
classifier = xgb.XGBClassifier(learning_rate = 0.1)
classifier = classifier.fit(x_train,y_train.flatten())

In [None]:
fpr5, tpr5, thresh5, f5 = get_results(classifier)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc_x = MinMaxScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [None]:
# The input width is taken from the training matrix instead of being
# hard-coded to 41, so the network keeps working when the number of encoded
# features changes.
n_features = x_train.shape[1]

# Two hidden ReLU layers of 21 units, each followed by 20% dropout, and a
# single sigmoid output unit for the binary target.
classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Dense(units = 21, activation = 'relu', input_shape = (n_features,)))
classifier.add(tf.keras.layers.Dropout(0.2))
classifier.add(tf.keras.layers.Dense(units = 21, activation = 'relu'))
classifier.add(tf.keras.layers.Dropout(0.2))
classifier.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
classifier.fit(x_train, y_train, batch_size = 10, epochs = 10)

In [None]:
def get_results1(classifier, x_eval=None, y_eval=None, threshold=0.5):
    """Evaluate a model whose `predict` returns positive-class probabilities.

    Intended for the Keras network in this notebook: `predict` output is
    flattened and treated as the probability of class 1. Draws the confusion
    matrix, prints the classification report and ROC AUC, and returns the ROC
    curve together with the macro-averaged F1 score.

    Parameters
    ----------
    classifier : fitted model whose `predict(x)` yields one score per row.
    x_eval, y_eval : optional evaluation features / labels; default to the
        notebook-level `x_test` / `y_test` (the original behaviour).
    threshold : scores strictly above this value are labelled 1 (default 0.5,
        the value that was previously hard-coded).

    Returns
    -------
    (fpr, tpr, thresh, f1) : arrays from `roc_curve` and the macro F1 score
        rounded to two decimals.
    """
    # Fall back to the global hold-out split so existing calls keep working.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    pred_prob1 = classifier.predict(x_eval).flatten()
    # Cast to int so predictions use the same 0/1 labels as y_eval instead of
    # mixing boolean predictions with integer targets in the metric functions.
    y_pred = (pred_prob1 > threshold).astype(int)
    fpr, tpr, thresh = roc_curve(y_eval, pred_prob1, pos_label=1)

    cm = confusion_matrix(y_eval, y_pred)
    plt.figure(figsize=(8,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    print(classification_report(y_eval, y_pred))
    auc = round(roc_auc_score(y_eval, pred_prob1), 2)
    print('AUC Score is - ',auc)
    f1 = round(f1_score(y_eval, y_pred, average='macro'), 2)
    return (fpr, tpr, thresh, f1)

In [None]:
fpr6, tpr6, thresh6, f6 = get_results1(classifier)

In [None]:
def model_per(models_names=None, f1_scores=None):
    """Plot a horizontal bar chart comparing models by F1 score.

    Parameters
    ----------
    models_names : optional sequence of model labels. Defaults to the six
        models trained in this notebook.
    f1_scores : optional sequence of F1 scores, one per label. Defaults to the
        notebook-level variables `f1` ... `f6`.

    Raises
    ------
    ValueError : if the number of labels and scores differ.

    Bars are sorted by score in ascending order (best model at the top) and
    each bar is annotated with its score.
    """
    if models_names is None:
        models_names = ['Logistic Regression','Decision Tree','Random Forest','Gradient Boosting','XGBoost','ANN']
    if f1_scores is None:
        f1_scores = [f1,f2,f3,f4,f5,f6]
    f1_scores = np.asarray(f1_scores)
    if len(models_names) != len(f1_scores):
        raise ValueError("models_names and f1_scores must have the same length")

    # The number of bars follows the data instead of a hard-coded constant.
    num_models = len(f1_scores)
    indices = np.argsort(f1_scores)
    sorted_scores = f1_scores[indices]

    plt.figure(figsize=(14,8))
    plt.title('Model Comparision')

    plt.barh(range(num_models), sorted_scores, color='b', align='center')
    plt.yticks(range(num_models), [models_names[i] for i in indices])
    plt.xlabel('F1-Score')
    plt.xlim((0,1))
    # Print each score inside its bar, near the left edge.
    for position, score in enumerate(sorted_scores):
        plt.annotate(score, xy=(0.05, position))
    plt.show()

In [None]:
model_per()

### Balancing the target variable

In [None]:
x = df_processed.values
y = df_orig.Risk_Flag.values

In [None]:
# Resampling pipeline: SMOTE oversampling followed by random undersampling.
# Both samplers draw random samples, so a fixed random_state is set to make
# the resampled data (and every metric computed from it) reproducible.
over = SMOTE(sampling_strategy=0.3, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
x,y = pipeline.fit_resample(x,y)

In [None]:
# Distribution of the target after resampling.
plt.figure(figsize=(10,6))
# Only the resampled labels are plotted. The former `data=df_orig` argument
# referred to the original, differently sized frame and played no role here.
ax = sns.countplot(x=y)
# Styling is applied after plotting. The original set the x tick size twice
# and never set the y tick size.
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Risk_Flag', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
# Write each bar's count just below its top edge.
for p in ax.patches:
    ax.annotate(f'\n{p.get_height()}', (p.get_x()+0.38, p.get_height()), ha='center', va='top', color='white', size=18)

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL data first and resample only the training portion.
# Resampling before the split lets SMOTE's synthetic points (interpolated from
# rows that end up in the test set) leak into evaluation and changes the test
# class balance, which inflates the reported scores. The test set below keeps
# the real, imbalanced distribution.
x_train, x_test, y_train, y_test = train_test_split(
    df_processed.values, df_orig.Risk_Flag.values, test_size = 0.2, random_state = 0)
x_train, y_train = pipeline.fit_resample(x_train, y_train)

In [None]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(random_state = 0)
classifier.fit(x_train,y_train)

In [None]:
fpr1, tpr1, thresh1,f1 = get_results(classifier)

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(x_train,y_train.flatten())

In [None]:
fpr2, tpr2, thresh2, f2 = get_results(classifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state = 0)
classifier.fit(x_train,y_train.flatten())

In [None]:
fpr3, tpr3, thresh3, f3 = get_results(classifier)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(learning_rate=0.1)
classifier = classifier.fit(x_train,y_train.flatten())

In [None]:
fpr4, tpr4, thresh4, f4 = get_results(classifier)

In [None]:
import xgboost as xgb
classifier = xgb.XGBClassifier(learning_rate = 0.1)
classifier = classifier.fit(x_train,y_train.flatten())

In [None]:
fpr5, tpr5, thresh5, f5 = get_results(classifier)

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc_x = MinMaxScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [None]:
# The input width is taken from the training matrix instead of being
# hard-coded to 41, so the network keeps working when the number of encoded
# features changes.
n_features = x_train.shape[1]

# Same architecture as before: two hidden ReLU layers of 21 units with 20%
# dropout each, and a single sigmoid output unit.
classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Dense(units = 21, activation = 'relu', input_shape = (n_features,)))
classifier.add(tf.keras.layers.Dropout(0.2))
classifier.add(tf.keras.layers.Dense(units = 21, activation = 'relu'))
classifier.add(tf.keras.layers.Dropout(0.2))
classifier.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
classifier.fit(x_train, y_train, batch_size = 10, epochs = 10)

In [None]:
fpr6, tpr6, thresh6, f6 = get_results1(classifier)

In [None]:
model_per()

In [None]:
def plot_roc():
    """Plot the ROC curves of the six trained models on one figure.

    Reads the notebook-level ROC arrays `fpr1..fpr6` / `tpr1..tpr6`, the F1
    scores `f1..f6` (shown in the legend) and `y_test` (used for the
    reference line). Takes no arguments and returns nothing.
    """
    # A constant score for every sample yields the diagonal reference line.
    random_probs = [0 for i in range(len(y_test))]
    p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

    # (false positive rates, true positive rates, colour, legend label)
    curves = [
        (fpr1, tpr1, 'green', 'F1-'+str(f1)+', Logistic Regression'),
        (fpr2, tpr2, 'teal', 'F1-'+str(f2)+', Decision Tree'),
        (fpr3, tpr3, 'orange', 'F1-'+str(f3)+', Random Forest'),
        (fpr4, tpr4, 'blue', 'F1-'+str(f4)+', Gradient Boosting'),
        (fpr5, tpr5, 'red', 'F1-'+str(f5)+', XGBoost'),
        (fpr6, tpr6, 'yellow', 'F1-'+str(f6)+', ANN'),
    ]

    # plot roc curves
    plt.figure(figsize=(10,8))
    for fpr, tpr, colour, label in curves:
        plt.plot(fpr, tpr, linestyle='--', color=colour, label=label)

    # The reference line was previously red, the same colour as the XGBoost
    # curve, and had no legend entry. It is now grey and labelled.
    plt.plot(p_fpr, p_tpr, linestyle='--', color='grey', label='No skill')
    # title
    plt.title('ROC curve')
    # x label
    plt.xlabel('False Positive Rate')
    # y label
    plt.ylabel('True Positive rate')

    plt.legend(loc='best')

In [None]:
plot_roc()

In [None]:
def imp_features(classifier,top=10):
    """Plot the `top` most important features of a fitted model.

    Parameters
    ----------
    classifier : fitted estimator exposing `feature_importances_`.
    top : number of features to show (default 10). Values larger than the
        number of features are clamped; values below 1 produce an empty plot.

    Feature names are read from the notebook-level `df_processed.columns`, so
    `classifier` must have been trained on exactly those columns.

    Raises
    ------
    ValueError : if the model's importance vector and `df_processed` disagree
        on the number of features (the labels would otherwise be wrong).
    """
    features = df_processed.columns
    importances = np.asarray(classifier.feature_importances_)
    if len(importances) != len(features):
        raise ValueError(
            "classifier has {} feature importances but df_processed has {} columns".format(
                len(importances), len(features)))

    # Clamp the requested count so the bar positions and the selected values
    # always have the same length.
    num_features = max(0, min(int(top), len(importances)))
    # Ascending order, so the most important feature ends up at the top of the
    # chart. An explicit start index is used because `[-0:]` would select
    # everything rather than nothing.
    indices = np.argsort(importances)[len(importances) - num_features:]

    plt.figure(figsize=(10,10))
    plt.title('Feature Importances')

    # only plot the customized number of features
    plt.barh(range(num_features), importances[indices], color='b', align='center')
    plt.yticks(range(num_features), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

In [None]:
# At this point `classifier` is the Keras network trained above, which has no
# `feature_importances_` attribute, so passing it here fails on a fresh
# Restart & Run All. A random forest is fitted on the current training data
# under its own name and used for the importance plot instead.
# NOTE(review): random forest is assumed to be the model the importance plot
# was meant to describe - confirm.
importance_model = RandomForestClassifier(n_estimators=100, random_state = 0)
importance_model.fit(x_train, y_train)
imp_features(importance_model)

#### Here we can see that the numerical features carry most of the importance, while all the other features are insignificant in comparison.

In [None]:
# Rebuild the modelling frame from the numeric features only, then extract the
# feature matrix and the target vector from it.
numeric_feature_frame = df_refined[numerical_vars]
df_processed = pd.concat([numeric_feature_frame], axis = 1)
x = df_processed.values
y = df_orig['Risk_Flag'].values

In [None]:
# Resampling pipeline: SMOTE oversampling followed by random undersampling.
# Both samplers draw random samples, so a fixed random_state is set to make
# the resampled data (and every metric computed from it) reproducible.
over = SMOTE(sampling_strategy=0.3, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
x,y = pipeline.fit_resample(x,y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the ORIGINAL numeric data first and resample only the training part.
# Resampling before the split lets SMOTE's synthetic points leak into the
# test set and changes its class balance, which inflates the scores.
x_train, x_test, y_train, y_test = train_test_split(
    df_processed.values, df_orig.Risk_Flag.values, test_size = 0.2, random_state = 0)
x_train, y_train = pipeline.fit_resample(x_train, y_train)

# Fit the scaler on the training data only and reuse it for the test data.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state = 0)
classifier.fit(x_train,y_train.flatten())

In [None]:
# The random forest is evaluated with get_results, which scores the ROC/AUC
# from predict_proba. get_results1 is written for the Keras network and would
# treat the forest's hard 0/1 predictions as probabilities.
fpr7, tpr7, thresh7, f7 = get_results(classifier)

In [None]:
def model_per(models_names=None, f1_scores=None):
    """Plot a horizontal bar chart comparing models by F1 score.

    Parameters
    ----------
    models_names : optional sequence of model labels. Defaults to the seven
        models trained in this notebook (six on all features plus the random
        forest trained on numeric features only).
    f1_scores : optional sequence of F1 scores, one per label. Defaults to the
        notebook-level variables `f1` ... `f7`.

    Raises
    ------
    ValueError : if the number of labels and scores differ.

    Bars are sorted by score in ascending order (best model at the top) and
    each bar is annotated with its score.
    """
    if models_names is None:
        models_names = ['Logistic Regression','Decision Tree','Random Forest','Gradient Boosting','XGBoost','ANN','Random Forest(Numaric)']
    if f1_scores is None:
        f1_scores = [f1,f2,f3,f4,f5,f6,f7]
    f1_scores = np.asarray(f1_scores)
    if len(models_names) != len(f1_scores):
        raise ValueError("models_names and f1_scores must have the same length")

    # The number of bars follows the data instead of a hard-coded constant.
    num_models = len(f1_scores)
    indices = np.argsort(f1_scores)
    sorted_scores = f1_scores[indices]

    plt.figure(figsize=(14,8))
    plt.title('Model Comparision')

    plt.barh(range(num_models), sorted_scores, color='b', align='center')
    plt.yticks(range(num_models), [models_names[i] for i in indices])
    plt.xlabel('F1-Score')
    plt.xlim((0,1))
    # Print each score inside its bar, near the left edge.
    for position, score in enumerate(sorted_scores):
        plt.annotate(score, xy=(0.05, position))
    plt.show()

In [None]:
model_per()