***Decision Tree Regression***

# **Data Preprocessing**

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Datasets For Machine Learning/Salary Data (1).csv')
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [30]:
df.isnull().sum() #finding null values

Unnamed: 0,0
Age,2
Gender,2
Education Level,2
Job Title,2
Years of Experience,2
Salary,2


In [31]:
df['Age'] = df['Age'].fillna(df['Age'].mean())  #Handling null values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])
df['Education Level'] = df['Education Level'].fillna(df['Education Level'].mode().iloc[0])
df['Job Title'] = df['Job Title'].fillna(df['Job Title'].mode().iloc[0])
df['Years of Experience'] = df['Years of Experience'].fillna(df['Years of Experience'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [32]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Education Level,0
Job Title,0
Years of Experience,0
Salary,0


In [33]:
#Encoding categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [34]:
df['Gender'] = le.fit_transform(df['Gender']) # converting the categorical data into numerical
df['Education Level'] = le.fit_transform(df['Education Level'])
df['Job Title'] = le.fit_transform(df['Job Title'])

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  375 non-null    float64
 1   Gender               375 non-null    int64  
 2   Education Level      375 non-null    int64  
 3   Job Title            375 non-null    int64  
 4   Years of Experience  375 non-null    float64
 5   Salary               375 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 17.7 KB


In [36]:
#Handling outliers
z_score =(df['Age'] - df["Age"].mean())/df['Age'].std()
for i in z_score:
  if i<-3 or i>3:
    print("You have outlier for Age",i)
    o_val = i
  else:
    continue

z_score =(df['Years of Experience'] - df["Years of Experience"].mean())/df['Years of Experience'].std()
for i in z_score:
  if i<-3 or i>3:
    print("You have outlier for YoE",i)
    o_val = i
  else:
    continue

In [37]:
#Handling duplicates
#duplicates = df[df.duplicated(keep="first")]
#duplicates

In [38]:
#df = df.drop_duplicates()

In [39]:
#duplicates = df[df.duplicated(keep="first")]
#duplicates

In [40]:
import matplotlib.pyplot as plt #for plotting
import seaborn as sns

In [41]:
#sns.set(font_scale=2)
#plt.subplots(figsize=(20,20))
#heat_plot=sns.heatmap(df.corr(method='pearson'),annot=True,cmap='RdYlGn',annot_kws={'size':20}) #Pearson correlation / heatmap

#plt.yticks(fontsize =35) # font size of y axis
#plt.xticks(fontsize =35) # font size of x axis

#plt.show()

In [42]:
#Finding exact correlation values by descending order
correlations =df.corr(method='pearson')['Salary'].sort_values(ascending=False)
correlations

Unnamed: 0,Salary
Salary,1.0
Years of Experience,0.930338
Age,0.922335
Education Level,0.669389
Job Title,0.135585
Gender,0.070931


We can opt out Gender

In [43]:
#df.dropna(subset=['Gender'],inplace = True)

In [44]:
# Shuffling the dataset : Use it only when any row is dropped
#from sklearn.utils import shuffle
#df = shuffle(df)

In [45]:
#re-arranging the index values
#df = df.reset_index(drop=True)
#df

In [46]:
#Separating feature and Target
x = df.drop(columns=['Salary']) #feature
y = df['Salary']   #target

*Scaling is not needed for decision tree*

In [47]:
#Scaling data
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#x = sc.fit_transform(x)
#x

In [48]:
#Splitting data
#from sklearn.model_selection import train_test_split
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [49]:
#print("train data size (features):",len(x_train))
#print("train data size (target):",len(y_train))

#print("test data size (features):",len(x_test))
#print("test data size (target):",len(y_test))

# **Model**

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

***K Fold Cross Validation***

In [52]:
from sklearn.tree import DecisionTreeRegressor
DTrgsr = DecisionTreeRegressor(criterion='squared_error',
                               max_depth= 10,
                               min_samples_split= 2,
                               min_samples_leaf=1,
                               max_features=None,
                               max_leaf_nodes=None,
                               min_impurity_decrease=0.0,
                               )

In [62]:
from sklearn.model_selection import cross_val_score, KFold
#from sklearn.model_selection import KFold

k_fold = KFold(10)

result1 = cross_val_score(DTrgsr,x,y,cv=k_fold, scoring='neg_mean_squared_error')
result2 = cross_val_score(DTrgsr,x,y,cv=k_fold, scoring='neg_mean_absolute_percentage_error')
result3 = cross_val_score(DTrgsr,x,y,cv=k_fold, scoring='r2')

In [72]:
# Fnd what metrics support
import sklearn
sklearn.metrics.get_scorer_names()


['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'd2_absolute_error_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_max_error',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'neg_root_mean_squared_log_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 're

In [64]:
result1 # negative MSE

array([-6.49516859e+08, -9.17806063e+08, -3.74431641e+08, -2.21146997e+08,
       -1.33151351e+08, -1.72811936e+08, -1.35956848e+08, -7.61804882e+07,
       -8.25562997e+07, -2.43316850e+08])

In [65]:
result2 # negative MAE

array([-0.19916445, -0.19093008, -0.17456405, -0.11895339, -0.10725051,
       -0.11070287, -2.77656799, -0.05316701, -0.05063269, -0.05766894])

In [66]:
overall_MAE_percentage = (sum(result2)/len(result2))
print(abs(overall_MAE_percentage)*100)

38.39601988426483


In [70]:
result3 # R2

array([0.73213625, 0.65261872, 0.82840483, 0.84170729, 0.93101084,
       0.92112801, 0.93883852, 0.97147089, 0.95846647, 0.85570615])

In [71]:
overall_R2 = sum(result3)/len(result3)
print(overall_R2)

0.8631487982033585
