# Predict A Doctor's Consultation Fee 

We have all been in a situation where we go to a doctor in an emergency and find that the consultation fee is too high. As data scientists, we should be able to do better. What if you had data recording important details about a doctor and could build a model to predict the doctor's consulting fee? This is the hackathon that lets you do that.



Size of training set: 5961 records


Size of test set: 1987 records



FEATURES:


Qualification: Qualification and degrees held by the doctor


Experience: Experience of the doctor in number of years


Rating: Rating given by patients


Profile: Type of the doctor


Miscellaneous_Info: Extra information about the doctor


Fees: Fees charged by the doctor


Place: Area and the city where the doctor is located.

In [None]:
# Numpy and Pandas library import for analysis of the dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the training set, the test set and the submission template from the
# Excel workbooks (paths are relative to the working directory).

Final_Train, Final_Test, Sample_submission = map(
    pd.read_excel,
    ('Final_Train.xlsx', 'Final_Test.xlsx', 'Sample_submission.xlsx'),
)

In [None]:
# Report (rows, columns) for each of the three frames

for label, frame in (
    ("Final_Train:", Final_Train),
    ("Final_Test:", Final_Test),
    ("Sample_submission:", Sample_submission),
):
    print(label, frame.shape)

In [None]:
# Initial 5 rows of data for Final_Train

Final_Train.head()

In [None]:
# Initial 5 rows of data for Final_Test

Final_Test.head()

In [None]:
Sample_submission.head()

In [None]:
# Keep the column names of each frame as plain Python lists

list_Final_Train = Final_Train.columns.tolist()
list_Final_Test = Final_Test.columns.tolist()
list_Sample_submission = Sample_submission.columns.tolist()

# ** General Information **

In [None]:
Final_Train.info()

In [None]:
Final_Test.info()

# ****************** Checking Null Values ******************

In [None]:
Final_Test.isnull().sum()

We have null values in the columns: Rating, Place, Miscellaneous_Info

In [None]:
Final_Train.isnull().sum()

We have null values in the columns: Rating, Place, Miscellaneous_Info

# ** Unique values check **

In [None]:
# Number of distinct values per feature column in the training frame
print('Final_Train\n')
for label, column in (
    ('Qualification:', 'Qualification'),
    ('Experience:', 'Experience'),
    ('Rating:', 'Rating'),
    ('Place:', 'Place'),
    ('Profile', 'Profile'),
):
    print(label, Final_Train[column].nunique())

In [None]:
# Number of distinct values per feature column in the test frame
print('Final_Test\n')
for label, column in (
    ('Qualification:', 'Qualification'),
    ('Experience:', 'Experience'),
    ('Rating:', 'Rating'),
    ('Place:', 'Place'),
    ('Profile', 'Profile'),
):
    print(label, Final_Test[column].nunique())

In [None]:
# Strip the 'years experience' text from the Experience column of Final_Train
# so that only the number remains, then store the column as int.
# Experience has no null values, so no filling is needed before the cast.

train_experience_text = Final_Train['Experience'].str.replace('years experience', '')
Final_Train['Experience'] = train_experience_text.astype(int)
Final_Train.head(2)

In [None]:
# Same clean-up for the test frame: drop the 'years experience' text and cast to int
test_experience_text = Final_Test['Experience'].str.replace('years experience', '')
Final_Test['Experience'] = test_experience_text.astype(int)
Final_Test.head(2)

In [None]:
# Missing ratings are filled with the placeholder '0%' so the column can be
# cast to int; note this makes an unrated doctor indistinguishable from one
# rated 0%.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# selected column is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
Final_Train['Rating'] = Final_Train['Rating'].fillna('0%')

# Drop the trailing % sign and keep the number as int
Final_Train['Rating'] = Final_Train['Rating'].str[:-1].astype(int)
Final_Train.head(2)

In [None]:
# Missing ratings are filled with the placeholder '0%' so the column can be
# cast to int; note this makes an unrated doctor indistinguishable from one
# rated 0%.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# selected column is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
Final_Test['Rating'] = Final_Test['Rating'].fillna('0%')

# Drop the trailing % sign and keep the number as int
Final_Test['Rating'] = Final_Test['Rating'].str[:-1].astype(int)
Final_Test.head(2)

In [None]:
# Fill the null values in the Place column with 'unknown, unknown'.
# Place holds two pieces of information (area, city); the placeholder keeps
# that two-part shape so it can later be split like every other row.
# The result is assigned back explicitly: fillna(inplace=True) on a selected
# column is chained assignment and does not update the frame when pandas
# Copy-on-Write is active.

Final_Train['Place'] = Final_Train['Place'].fillna('unknown, unknown')
Final_Test['Place'] = Final_Test['Place'].fillna('unknown, unknown')

In [None]:
# Split Place on commas: the last piece becomes the new City column and the
# first piece replaces Place. The pieces are not whitespace-trimmed.

for frame in (Final_Train, Final_Test):
    place_parts = frame['Place'].str.split(',')
    frame['City'] = place_parts.str[-1]
    frame['Place'] = place_parts.str[0]

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
Final_Train['Qualification'].value_counts()

In [None]:
Final_Test['Qualification'].value_counts()

In [None]:
# Relative frequency of the 20 most common Qualification strings
top_qualification_share = Final_Train['Qualification'].value_counts(normalize=True).head(20)
top_qualification_share.plot(kind='barh')
plt.show()

In [None]:
# Keep only the first comma-separated entry of each Qualification string
for frame in (Final_Train, Final_Test):
    frame['Qualification'] = frame['Qualification'].str.split(',').str.get(0)

In [None]:
Final_Train['Qualification'].unique()

In [None]:
Final_Test['Qualification'].unique()

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
Final_Train['Profile'].unique()

In [None]:
# Encode Profile as integer codes using ONE category list shared by both
# frames. Building the categorical separately for each frame numbers the
# labels independently, so the same Profile could receive different codes in
# train and test whenever their sets of distinct values differ.
profile_categories = pd.Categorical(
    pd.concat([Final_Train['Profile'], Final_Test['Profile']], ignore_index=True)
).categories

Final_Train['Profile'] = pd.Categorical(Final_Train['Profile'], categories=profile_categories)
Final_Test['Profile'] = pd.Categorical(Final_Test['Profile'], categories=profile_categories)

# create a new column holding the profile code
Final_Train['Profile Code'] = Final_Train['Profile'].cat.codes
Final_Test['Profile Code'] = Final_Test['Profile'].cat.codes

# Drop the Profile column from both the test and train dataframe

Final_Train.drop(['Profile'], axis=1, inplace=True)
Final_Test.drop(['Profile'], axis=1, inplace=True)

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
# Encode City as integer codes using ONE category list shared by both frames.
# Building the categorical separately for each frame numbers the labels
# independently, so the same city could receive different codes in train and
# test whenever their sets of distinct values differ.
city_categories = pd.Categorical(
    pd.concat([Final_Train['City'], Final_Test['City']], ignore_index=True)
).categories

Final_Train['City'] = pd.Categorical(Final_Train['City'], categories=city_categories)
Final_Test['City'] = pd.Categorical(Final_Test['City'], categories=city_categories)

# create a new column holding the city code
Final_Train['city'] = Final_Train['City'].cat.codes
Final_Test['city'] = Final_Test['City'].cat.codes

# Drop the City column from both the test and train dataframe

Final_Train.drop(['City'], axis=1, inplace=True)
Final_Test.drop(['City'], axis=1, inplace=True)

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
# Encode Place as integer codes using ONE category list shared by both frames.
# Building the categorical separately for each frame numbers the labels
# independently, so the same place could receive different codes in train and
# test whenever their sets of distinct values differ.
place_categories = pd.Categorical(
    pd.concat([Final_Train['Place'], Final_Test['Place']], ignore_index=True)
).categories

Final_Train['Place'] = pd.Categorical(Final_Train['Place'], categories=place_categories)
Final_Test['Place'] = pd.Categorical(Final_Test['Place'], categories=place_categories)

# create a new column holding the place code
Final_Train['place'] = Final_Train['Place'].cat.codes
Final_Test['place'] = Final_Test['Place'].cat.codes

# Drop the Place column from both the test and train dataframe

Final_Train.drop(['Place'], axis=1, inplace=True)
Final_Test.drop(['Place'], axis=1, inplace=True)

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
# Encode Qualification as integer codes using ONE category list shared by both
# frames. Building the categorical separately for each frame numbers the
# labels independently, so the same qualification could receive different
# codes in train and test whenever their sets of distinct values differ.
qualification_categories = pd.Categorical(
    pd.concat([Final_Train['Qualification'], Final_Test['Qualification']], ignore_index=True)
).categories

Final_Train['Qualification'] = pd.Categorical(
    Final_Train['Qualification'], categories=qualification_categories
)
Final_Test['Qualification'] = pd.Categorical(
    Final_Test['Qualification'], categories=qualification_categories
)

# create a new column holding the qualification code
Final_Train['qualification'] = Final_Train['Qualification'].cat.codes
Final_Test['qualification'] = Final_Test['Qualification'].cat.codes

# Drop the Qualification column from both the test and train dataframe

Final_Train.drop(['Qualification'], axis=1, inplace=True)
Final_Test.drop(['Qualification'], axis=1, inplace=True)

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
# Count of missing Miscellaneous_Info entries, followed by the column's shape,
# for the training frame and then the test frame
for message, frame in (
    ("Miscellaneous_Info misisng values in Final_Train = ", Final_Train),
    ("Miscellaneous_Info misisng values in Final_Test = ", Final_Test),
):
    misc_column = frame['Miscellaneous_Info']
    print(message, misc_column.isnull().sum())
    print(misc_column.shape)

Since around 50% of data is missing in Miscellaneous_Info column, at this stage we will drop this column.

In [None]:
# Remove the Miscellaneous_Info column from both frames, in place
for frame in (Final_Train, Final_Test):
    frame.drop(columns=['Miscellaneous_Info'], inplace=True)

In [None]:
Final_Train.head(2)

In [None]:
Final_Test.head(2)

In [None]:
# Pairwise correlation of the Final_Train columns, drawn as an annotated heatmap
train_correlations = Final_Train.corr()

plt.figure(figsize=(15, 7))
sns.heatmap(
    train_correlations,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    linecolor='black',
)
plt.show()

In [None]:
# Distribution of the target. sns.distplot is deprecated in seaborn; histplot
# with a KDE overlay on a density scale is the equivalent view.
sns.histplot(Final_Train['Fees'], kde=True, stat='density')
plt.show()

In [None]:
# Fees against Rating
sns.catplot(x="Rating", y="Fees", data=Final_Train)
# Rotation is passed as a number: newer Matplotlib releases only accept a
# number or 'vertical'/'horizontal' here and reject the string "90".
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fees against years of Experience
sns.catplot(x="Experience", y="Fees", data=Final_Train)
# Rotation is passed as a number: newer Matplotlib releases only accept a
# number or 'vertical'/'horizontal' here and reject the string "90".
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fees against the encoded doctor profile
sns.catplot(x="Profile Code", y="Fees", data=Final_Train)
# Rotation is passed as a number: newer Matplotlib releases only accept a
# number or 'vertical'/'horizontal' here and reject the string "90".
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fees against the encoded city
sns.catplot(x="city", y="Fees", data=Final_Train)
# Rotation is passed as a number: newer Matplotlib releases only accept a
# number or 'vertical'/'horizontal' here and reject the string "90".
plt.xticks(rotation=90)
plt.show()

Outlier checking

In [None]:
from scipy.stats import zscore
import numpy as np

# Absolute z-score of every value in Final_Train. The whole frame is passed,
# so the target (Fees) and the integer category codes are scored as well.
z = np.abs(zscore(Final_Train))

# Cut-off above which a value is treated as an outlier
threshold = 3

# Row/column positions of the values beyond the cut-off
np.where(z > threshold)

In [None]:
# Keep only the rows in which every column is within the z-score cut-off.
# `threshold` is the cut-off defined next to the z-score computation; using it
# here keeps the filter and the cut-off in sync.
# .copy() gives an independent frame, so later edits to final_train do not
# refer back to Final_Train.
rows_within_threshold = (z < threshold).all(axis=1)
final_train = Final_Train[rows_within_threshold].copy()
print(final_train.shape)
final_train.head()

Splitting Final_Train into x (features) and y (target)

In [None]:
# Features: every column except the target
x = final_train.drop(['Fees'], axis=1)

# Target as a 1-D Series. Wrapping it in a DataFrame hands a column vector to
# the estimators, which expect a 1-D target (the resulting warning is hidden
# by the global warnings filter).
y = final_train['Fees']

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# #Finding the best random state
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

# #Finding the best random state
# best_rstate=0
# accu=0

# for i in range(30,200):
#     x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.25,random_state=i)
#     mod=LinearRegression()
#     mod.fit(x_train,y_train)
#     y_pred = mod.predict(x_test)
#     tempaccu=r2_score(y_test,y_pred)
#     if tempaccu > accu:
#         accu = tempaccu
#         best_rstate = i
        
# print(f"Best Accuracy {accu*100} found on Random state {best_rstate}")

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.20,
    random_state=1,
)

LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

# Fit on the training split and predict the held-out split
LR = LinearRegression()
LR.fit(x_train, y_train)
y_pred = LR.predict(x_test)

# Hold-out R² and the mean of a 5-fold cross-validation on the training split
r2score = r2_score(y_test, y_pred)
fold_scores = cross_val_score(LinearRegression(), x_train, y_train, cv=5)
cvscore = fold_scores.mean()

print(f"Accuracy= {r2score*100}, cross_val_score= {cvscore*100} & difference= {(r2score*100)-(cvscore*100)}")

RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# The forest expects a 1-D target; flattening works whether y_train is a
# Series or a single-column DataFrame.
y_train_1d = np.ravel(y_train)

# Fixed seed so the reported scores are reproducible between runs
RF = RandomForestRegressor(random_state=1)
RF.fit(x_train, y_train_1d)
y_pred = RF.predict(x_test)

# Hold-out R² and the mean of a 5-fold cross-validation on the training split
r2score = r2_score(y_test, y_pred)
cvscore = cross_val_score(
    RandomForestRegressor(random_state=1), x_train, y_train_1d, cv=5
).mean()

print(f"Accuracy= {r2score*100}, cross_val_score= {cvscore*100} & difference= {(r2score*100)-(cvscore*100)}")

Adaboost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost expects a 1-D target; flattening works whether y_train is a Series
# or a single-column DataFrame.
y_train_1d = np.ravel(y_train)

# Fixed seed so the reported scores are reproducible between runs
ADB = AdaBoostRegressor(random_state=1)
ADB.fit(x_train, y_train_1d)
y_pred = ADB.predict(x_test)

# Hold-out R² and the mean of a 5-fold cross-validation on the training split
r2score = r2_score(y_test, y_pred)
cvscore = cross_val_score(
    AdaBoostRegressor(random_state=1), x_train, y_train_1d, cv=5
).mean()

print(f"Accuracy= {r2score*100}, cross_val_score= {cvscore*100} & difference= {(r2score*100)-(cvscore*100)}")

I received a very poor accuracy (R² score) with this model.

In [None]:
# Hyper-parameter search space for RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(v) for v in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# 'auto' itself has been removed from scikit-learn and is rejected there.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(v) for v in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

parameters = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# `parameters` spans 10 * 2 * 12 * 3 * 3 * 2 = 4,320 combinations. An
# exhaustive grid search with cv=5 would fit 21,600 forests of up to 2,000
# trees each, so a fixed number of random draws from the space is used instead.
# The variable keeps the name GCV so any later cell that refers to it still works.
GCV = RandomizedSearchCV(
    RandomForestRegressor(random_state=1),
    param_distributions=parameters,
    n_iter=50,          # number of sampled parameter settings
    cv=5,
    scoring='r2',
    random_state=1,     # reproducible sampling of the settings
    n_jobs=-1,          # use all available cores
)
# np.ravel: the forest expects a 1-D target (works for a Series or a one-column DataFrame)
GCV.fit(x_train, np.ravel(y_train))

# Best configuration found by the search
print(GCV.best_estimator_)

# Score the refitted best estimator on the held-out split
GCV_pred = GCV.best_estimator_.predict(x_test)
print("Final Accuracy: ", r2_score(y_test, GCV_pred)*100)  # hold-out R² in percent