# Loading Required Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Let's Check out the data set

In [61]:
df = pd.read_csv("Win_Prediction_Data.csv")

In [62]:
df.head()

Unnamed: 0,Client_Category,Solution_Type,Deal_Date,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,Telecom,Solution 7,2012-03-27,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,150000.0,Won
1,Telecom,Solution 7,2012-09-25,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,744705.88,Won
2,Internal,Solution 59,2011-08-01,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
3,Internal,Solution 59,2011-04-28,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
4,Internal,Solution 32,2011-06-03,Sector 20,Others,Ekta Zutshi,Russell Dahlen,80882.35,Lost


In [63]:
#Dimension of dataset

df.shape

(10061, 9)

In [64]:
#checking null values

df.isnull().sum()

Client_Category    79
Solution_Type       0
Deal_Date           0
Sector              0
Location            0
VP_Name             0
Manager_Name        0
Deal_Cost           0
Deal_Status         0
dtype: int64

In [65]:
#Removing Duplicates Values 

df.drop_duplicates(subset=None, keep='first', inplace=True)

In [66]:
df.shape

(10047, 9)

In [67]:
#Filling null values of client category with Others

df["Client_Category"].fillna("Others",inplace = True)

In [68]:
df.isnull().sum()

Client_Category    0
Solution_Type      0
Deal_Date          0
Sector             0
Location           0
VP_Name            0
Manager_Name       0
Deal_Cost          0
Deal_Status        0
dtype: int64

In [69]:
df["Deal_Cost"].describe()

count    1.004700e+04
mean     7.672963e+05
std      1.620881e+06
min      0.000000e+00
25%      1.891176e+05
50%      3.823529e+05
75%      7.647059e+05
max      3.676471e+07
Name: Deal_Cost, dtype: float64

In [70]:
#Replacing 0 value of deal cost with nan

#df = df.replace(0,np.nan)

In [71]:
#filling zero deal cost values with median deal cost of all lost deals

def Deal_Cost_Median(Deal_Cost):
    if Deal_Cost == 0:
        return 441176.47
    else:
        return Deal_Cost

In [72]:
df["Deal_Cost"] = df["Deal_Cost"].apply(Deal_Cost_Median)

In [73]:
df["Deal_Cost"].describe()

count    1.004700e+04
mean     7.780546e+05
std      1.617212e+06
min      2.941200e+02
25%      2.058824e+05
50%      4.117647e+05
75%      7.647059e+05
max      3.676471e+07
Name: Deal_Cost, dtype: float64

In [74]:
#Final data set

df.head()

Unnamed: 0,Client_Category,Solution_Type,Deal_Date,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,Telecom,Solution 7,2012-03-27,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,150000.0,Won
1,Telecom,Solution 7,2012-09-25,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,744705.88,Won
2,Internal,Solution 59,2011-08-01,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
3,Internal,Solution 59,2011-04-28,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,Lost
4,Internal,Solution 32,2011-06-03,Sector 20,Others,Ekta Zutshi,Russell Dahlen,80882.35,Lost


In [75]:
def Deal_status_Code(Deal_Status):
    if Deal_Status == "Won":
        return 1
    else:
        return 0

In [76]:
df["Deal_Status"] = df["Deal_Status"].apply(Deal_status_Code)

In [77]:
df.head()

Unnamed: 0,Client_Category,Solution_Type,Deal_Date,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,Telecom,Solution 7,2012-03-27,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,150000.0,1
1,Telecom,Solution 7,2012-09-25,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,744705.88,1
2,Internal,Solution 59,2011-08-01,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,0
3,Internal,Solution 59,2011-04-28,Sector 20,Others,Ekta Zutshi,Russell Dahlen,60000.0,0
4,Internal,Solution 32,2011-06-03,Sector 20,Others,Ekta Zutshi,Russell Dahlen,80882.35,0


In [78]:
#Removing Internal Client Category from dataset

df = df[df["Client_Category"]!="Internal"]

In [79]:
df

Unnamed: 0,Client_Category,Solution_Type,Deal_Date,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,Telecom,Solution 7,2012-03-27,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,150000.00,1
1,Telecom,Solution 7,2012-09-25,Sector 24,L5,Ekta Zutshi,Gopa Trilochana,744705.88,1
7,Govt,Solution 7,2012-09-17,Sector 13,L5,Sargar Deep Rao,Vidur Hukle,409705.88,0
8,Consumer Good,Solution 42,2012-04-11,Sector 12,L10,Lilli Storrs,Md. Daud,1032352.94,1
10,International Bank,Solution 6,2012-02-11,Sector 2,L10,Long Bergstrom,Luv Malhotra,316176.47,1
...,...,...,...,...,...,...,...,...,...
10055,Telecom,Solution 9,2019-05-23,Sector 24,L10,Gopa Trilochana,Darla Dickinson,823529.41,0
10056,Power ind,Solution 9,2019-03-18,Sector 9,L5,Rudraksh Sharma,Rudraksh Sharma,588235.29,0
10058,Power ind,Solution 9,2019-02-28,Sector 9,L5,Rudraksh Sharma,Rudraksh Sharma,588235.29,0
10059,Power ind,Solution 62,2019-01-27,Sector 9,L5,Man Suddeth,Cleotilde Biron,3042058.82,1


In [80]:
df.to_csv('win_final.csv', index = False)

# Encoding

### Performing Target Guided Ordinal Encoding

In [81]:
col = ['Client_Category', 'Solution_Type', 'Sector', 'Location',
       'VP_Name', 'Manager_Name']

In [82]:
for feature in col:
    ordinal_labels_1 = df.groupby(feature)['Deal_Status'].mean().sort_values().index
    ordinal_labels_2={k:i for i,k in enumerate(ordinal_labels_1,0)}
    df[feature]=df[feature].map(ordinal_labels_2)

df.head(10)    

Unnamed: 0,Client_Category,Solution_Type,Deal_Date,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,29,31,2012-03-27,17,11,41,159,150000.0,1
1,29,31,2012-09-25,17,11,41,159,744705.88,1
7,16,31,2012-09-17,1,11,23,110,409705.88,0
8,36,41,2012-04-11,7,6,35,175,1032352.94,1
10,33,23,2012-02-11,11,6,36,116,316176.47,1
11,8,27,2012-03-30,11,10,17,71,929411.77,1
12,20,27,2012-04-26,11,11,17,121,2367647.06,1
13,19,27,2011-12-20,19,11,41,38,8823529.41,0
14,20,51,2011-12-22,21,11,36,166,122058.82,0
15,23,49,2012-01-23,18,12,41,91,529411.77,0


In [83]:
#Adding Deal Year Column to data set
df['Deal_Year'] = pd.DatetimeIndex(df['Deal_Date']).year

#Adding Deal Month Column to data set
df['Deal_Month'] = pd.DatetimeIndex(df['Deal_Date']).month

#Dropping Deal_Date Column
df.drop("Deal_Date", axis = 1, inplace = True);df.head()

Unnamed: 0,Client_Category,Solution_Type,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status,Deal_Year,Deal_Month
0,29,31,17,11,41,159,150000.0,1,2012,3
1,29,31,17,11,41,159,744705.88,1,2012,9
7,16,31,1,11,23,110,409705.88,0,2012,9
8,36,41,7,6,35,175,1032352.94,1,2012,4
10,33,23,11,6,36,116,316176.47,1,2012,2


In [84]:
data = df.copy()

data.drop("Deal_Year", axis = 1, inplace = True)
data.drop("Deal_Month", axis = 1, inplace = True)

In [85]:
data.head()

Unnamed: 0,Client_Category,Solution_Type,Sector,Location,VP_Name,Manager_Name,Deal_Cost,Deal_Status
0,29,31,17,11,41,159,150000.0,1
1,29,31,17,11,41,159,744705.88,1
7,16,31,1,11,23,110,409705.88,0
8,36,41,7,6,35,175,1032352.94,1
10,33,23,11,6,36,116,316176.47,1


In [86]:
#Independent and Dependent Feature

X = data.drop("Deal_Status", axis = 1)
Y = data["Deal_Status"]

In [87]:
# Train Test Split

from sklearn.model_selection import train_test_split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=102)

# Models

## Logistic Regression

In [89]:
from sklearn.linear_model import LogisticRegression

In [90]:
#intializing LogisticRegression to object logmodel

logmodel = LogisticRegression()

In [91]:
##Hyperparameter Optimization

from sklearn.model_selection import RandomizedSearchCV

# Create params

params = {'penalty' : ['l1', 'l2'], 
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear','sag','saga']}

In [92]:
log_randomcv = RandomizedSearchCV(logmodel,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [93]:
log_randomcv.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   34.7s finished


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=5, n_jobs=-1,
                   param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'sag', 'saga']},
                   scoring='roc_auc', verbose=3)

In [94]:
lg_model = log_randomcv.best_estimator_

In [95]:
#prediction
lg_prediction = lg_model.predict(X_test)

### Evaluation Metrics

In [96]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [97]:
print(confusion_matrix(y_test,lg_prediction))
print(accuracy_score(y_test,lg_prediction))
print(classification_report(y_test,lg_prediction))

[[969 151]
 [359 241]]
0.7034883720930233
              precision    recall  f1-score   support

           0       0.73      0.87      0.79      1120
           1       0.61      0.40      0.49       600

    accuracy                           0.70      1720
   macro avg       0.67      0.63      0.64      1720
weighted avg       0.69      0.70      0.68      1720



In [98]:
lg_model.score(X_train,y_train)

0.7108784176847004

## Xgboost

In [99]:
import xgboost
classifier=xgboost.XGBClassifier()

In [100]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}

In [101]:
xg_randomcv = RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [102]:
xg_randomcv.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   11.3s finished




RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [103]:
#best model from random search cv
xg_model = xg_randomcv.best_estimator_

In [104]:
#making prediction
xg_predictions = xg_model.predict(X_test)

In [105]:
xg_predictions

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

### Evaluation Metrics

In [106]:
print(confusion_matrix(y_test,xg_predictions))
print(accuracy_score(y_test,xg_predictions))
print(classification_report(y_test,xg_predictions))

[[1013  107]
 [ 260  340]]
0.7866279069767442
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1120
           1       0.76      0.57      0.65       600

    accuracy                           0.79      1720
   macro avg       0.78      0.74      0.75      1720
weighted avg       0.78      0.79      0.78      1720



In [107]:
xg_model.score(X_train,y_train)

0.8888888888888888

In [70]:
report = pd.DataFrame(y_test)

In [77]:
report["Prediction"]= xg_predictions

In [82]:
report["Deal Cost"] = X_test["Deal_Cost"]

In [84]:
report.to_csv("report1.csv")