## Importing all required modules

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy

## Replacing the NULL value as specified

In [21]:
data = pd.read_csv("Data_Science_Internship - Dump.csv")
data.replace('9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0', value=pd.NA)
data.columns

Index(['Unnamed: 0', 'Agent_id', 'status', 'lost_reason', 'budget', 'lease',
       'movein', 'source', 'source_city', 'source_country', 'utm_source',
       'utm_medium', 'des_city', 'des_country', 'room_type', 'lead_id'],
      dtype='object')

## Only taking LOST or WON rows

In [22]:
data=data[(data.status=='LOST') | (data.status=='WON') ]
data['status'].unique()

array(['LOST', 'WON'], dtype=object)

## Separating out the features and output

In [70]:
features=['Agent_id','budget','lease', 'movein', 'source', 'source_city', 'source_country','utm_source','utm_medium', 'des_city', 'des_country', 'room_type']
X=data[features]
y=data['status']

## Replacing null values with the modes of each column

In [24]:
X.isnull().sum()

Agent_id              0
budget             3694
lease              2336
movein            13610
source                0
source_city           0
source_country        0
utm_source            0
utm_medium            0
des_city              0
des_country           0
room_type         23491
dtype: int64

In [25]:
mode_list={}
for x in X:
    modeval=X[x].mode(dropna=True)[0]
    mode_list[x]=modeval
    X[x].fillna(modeval, inplace=True)
X.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[x].fillna(modeval, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[x].fillna(modeval, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[x].fillna(modeval, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[x].fillna(modeval, inplace=True)
A value is trying to be set 

Agent_id          0
budget            0
lease             0
movein            0
source            0
source_city       0
source_country    0
utm_source        0
utm_medium        0
des_city          0
des_country       0
room_type         0
dtype: int64

## Replacing categorical values with discrete numerical values

In [26]:
values={}
for col in X:
    values[col]=X[col].unique()
    X[col].replace(values[col],[x for x in range(X[col].nunique())], inplace=True)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].replace(values[col],[x for x in range(X[col].nunique())], inplace=True)


Unnamed: 0,Agent_id,budget,lease,movein,source,source_city,source_country,utm_source,utm_medium,des_city,des_country,room_type
0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,0,0
2,2,1,0,1,1,1,1,1,1,2,0,0
3,3,2,1,0,2,2,1,1,1,3,0,0
4,0,0,0,0,0,0,0,0,0,4,1,0


In [27]:
y_values=y.unique()
y.replace(y_values, [x for x in range(y.nunique())], inplace=True)
y_values

array(['LOST', 'WON'], dtype=object)

## Splitting the data into training and testing data with 70:30 ratio and feeding into a Random Forest Classifier

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5, shuffle=True)

In [29]:
model=RandomForestClassifier(max_depth=6, random_state=0, class_weight="balanced").fit(X_train, y_train)

## Scores for all the predictions

In [30]:
predicted = model.predict_proba(X_test)
predicted = [x[1] for x in predicted]
predicted[:5]

[0.512497061271729,
 0.4240547717633995,
 0.5353738034747324,
 0.5279474740950206,
 0.5333567993066562]

In [31]:
predicted = model.predict(X_test)
accuracy_score(y_test,predicted)

0.6411197466896948

In [32]:
precision_score(y_test, predicted)

0.12831369661266567

In [33]:
recall_score(y_test, predicted)

0.7344573234984194

In [34]:
f1_score(y_test, predicted)

0.21846105626077417

## Function to predict for any given value

In [93]:
def get_score(params:pd.Series):
    params=params[features]
    for x in params.keys():
        if(type(params[x])!=str):
            params[x]=mode_list[x]
        idx = numpy.where(values[x]==params[x])[0][0]
        params[x]=idx
    return model.predict_proba([params])[0][1]*100

input_features=data.iloc[0]
print("Chances of winning the lead: ", get_score(input_features))

Chances of winning the lead:  6.232897417128237


