In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
%matplotlib inline

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRFRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load the dataset from the CSV file (path is relative to the notebook's
# working directory) and preview the first rows.
data=pd.read_csv('age_of_marriage_data.csv')
data.head()

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
0,1,female,"5'4""",,others,Telugu,,London,United Kingdom,21.0
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0


In [3]:
def check_shape(df):
    """Print the number of rows and columns of a tabular object.

    Parameters
    ----------
    df : object with a two-element ``shape`` attribute
        Typically a ``pandas.DataFrame``; ``shape`` is unpacked as
        ``(rows, columns)``.

    Returns
    -------
    None
        The counts are written to standard output only.
    """
    rows, columns = df.shape
    print('Total number of rows:', rows)
    print('Total number of columns:', columns)

In [4]:
# Print the row and column counts of the freshly loaded DataFrame.
check_shape(data)

Total number of rows: 2567
totla nuber of Columns: 10


In [5]:
# Per-column overview: non-null counts and dtypes (shows which columns have gaps).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2567 entries, 0 to 2566
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2567 non-null   int64  
 1   gender           2538 non-null   object 
 2   height           2449 non-null   object 
 3   religion         1932 non-null   object 
 4   caste            2425 non-null   object 
 5   mother_tongue    2403 non-null   object 
 6   profession       2237 non-null   object 
 7   location         2412 non-null   object 
 8   country          2551 non-null   object 
 9   age_of_marriage  2548 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 200.7+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,id,age_of_marriage
count,2567.0,2548.0
mean,1284.0,29.648352
std,741.173394,2.802414
min,1.0,20.0
25%,642.5,28.0
50%,1284.0,30.0
75%,1925.5,32.0
max,2567.0,36.0


In [None]:
# Build a profiling report for the whole DataFrame and write it to output.html
# next to the notebook.
prof = ProfileReport(data)
prof.to_file(output_file='output.html')

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

In [None]:
# Show the frequency table of every column, followed by a dashed separator.
for column_name in data.columns:
    frequencies = data[column_name].value_counts()
    print(frequencies)
    print('-----------------------------------------')

In [None]:
# Drop columns that cannot help the prediction.
# `id` looks like a plain row identifier (a different value on every row), so it
# carries no signal about age_of_marriage.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
# Frequency tables again, now for the columns that are still in the frame.
for label in data.columns:
    column_values = data[label]
    print(column_values.value_counts())
    print('-----------------------------------------')

In [None]:
# Percentage of missing values in each column.
data.isnull().sum() / len(data) * 100

In [None]:
# (rows, columns) before rows with missing values are removed.
data.shape

In [None]:
# Remove every row that has at least one missing value; the frame is modified
# in place and the original row labels are kept (the index is not reset).
data.dropna(inplace=True)

In [None]:
# (rows, columns) after the incomplete rows were dropped.
data.shape

In [None]:
# Distinct profession labels present in the remaining rows.
data['profession'].unique()

In [None]:
# Heights are stored as feet-and-inches strings (e.g. 5'7"), so they have to be
# converted to a numeric unit such as centimetres before modelling.
data['height']

In [None]:
# Look at one raw height string (row label 1) to work out the parsing steps.
data.loc[1,'height']

In [None]:
# Split on the apostrophe: the first piece holds the feet, the second the inches
# (still followed by the double-quote mark).
data.loc[1,'height'].split('\'')

In [None]:
# Feet component, still as a string.
data.loc[1,'height'].split('\'')[0]

In [None]:
# Feet converted to centimetres (1 ft = 30.48 cm).
int(data.loc[1,'height'].split('\'')[0])*30.48

In [None]:
# Inches component with the double-quote mark stripped, converted to
# centimetres (1 in = 2.54 cm).
int(data.loc[1,'height'].split('\'')[1].replace('"',""))*2.54

In [None]:
def height_convert(x):
    """Convert a feet-and-inches height string to centimetres.

    Parameters
    ----------
    x : str
        Height written as ``<feet>'<inches>"`` (for example ``5'7"``).
        The inches part may be absent (``5'`` or ``5``) and then counts as 0.

    Returns
    -------
    float
        Height in centimetres: ``feet * 30.48 + inches * 2.54``.

    Raises
    ------
    ValueError
        If the feet or inches part is not an integer.
    AttributeError
        If ``x`` is not a string (e.g. a missing value).
    """
    # Split once; everything before the first apostrophe is the feet count.
    parts = x.strip().split('\'')
    feet = int(parts[0])

    # The inches piece still carries its double-quote mark; it may also be
    # missing or empty, which is treated as zero inches instead of crashing.
    inches_text = parts[1].replace('"', "").strip() if len(parts) > 1 else ""
    inches = int(inches_text) if inches_text else 0

    return feet*30.48 + inches*2.54

In [None]:
# Add a numeric height column (centimetres) derived from the raw height strings.
data['height_cms'] = data['height'].map(height_convert)

In [None]:
# Check that the new height_cms column looks right next to the raw height.
data.head()

In [None]:
# The raw height string is no longer needed once height_cms exists.
data.drop(columns=['height'], inplace=True)

In [None]:
# Preview the frame after the raw height column was removed.
data.head()

In [None]:
# Label-encode the remaining categorical (string-valued) columns

In [None]:
# Encoder that maps each distinct label of a column to an integer code.
le= LabelEncoder()

In [None]:
# Replace every categorical column by integer codes.
categorical_cols = ['gender', 'religion', 'caste', 'mother_tongue',
                    'profession', 'location', 'country']

# One fitted encoder per column, so each mapping can still be inspected or
# inverted later (a single shared encoder only remembers the last column).
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    # Assign through data[col] rather than data.loc[:, cols]: under pandas 2.x
    # the .loc form writes into the existing object-dtype columns, leaving the
    # codes as `object`, which XGBoost refuses to train on.
    data[col] = label_encoders[col].fit_transform(data[col])

In [None]:
# Confirm that the categorical columns now hold integer codes.
data.head()

In [None]:
# Features are every column except the target.
X = data.drop('age_of_marriage', axis=1)
y = data['age_of_marriage']

# Hold out 20% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Candidate regressors. Every estimator that uses randomness gets a fixed
# random_state so the score table is identical on each Restart & Run All.
models = {'Linear regression': LinearRegression(),
          'Tree': DecisionTreeRegressor(random_state=0),
          'Random_forest': RandomForestRegressor(n_estimators=80, max_depth=11, random_state=0),
          'Ada_boost': AdaBoostRegressor(random_state=0),
          'Xgboost': XGBRegressor(random_state=0),
          'Xgbr_boost': XGBRFRegressor(random_state=0)}

# model.score(...) is the estimator's default regression score (R^2),
# recorded separately for the training and the held-out rows.
train_score = {}
test_score = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    train_score[name] = model.score(X_train, y_train)
    test_score[name] = model.score(X_test, y_test)

In [None]:
# Score of each model on the training rows.
train_score

In [None]:
# Score of each model on the held-out test rows.
test_score

In [None]:
# One row per model: training score, test score and the gap between them
# (a large gap points to overfitting).
result = pd.DataFrame(train_score, index=['Train_score']).transpose()
result = result.assign(Test_score=list(test_score.values()))
result = result.assign(Difference=result['Train_score'] - result['Test_score'])
result

In [None]:
# Grouped bars per model for the three columns of the result table.
result.plot(kind='bar');

In [None]:
# Refit a random forest on its own for a closer look at its errors.
# random_state pins the bootstrap sampling so the metrics computed from this
# model are the same on every run.
rf = RandomForestRegressor(n_estimators=80, max_depth=11, random_state=0).fit(X_train, y_train)

In [None]:
# Predictions of the random forest for the held-out test features.
y_pred= rf.predict(X_test)

In [None]:
# Evaluate the test-set predictions with three standard regression metrics.
mse, mae, r_squre = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Print each metric, separated by blank lines for readability.
print('mean_squared_error:', mse)
print('\n')
print('mean_absolute_error:', mae)
print('\n')
print('r_square:', r_squre)