In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the happiness dataset from a CSV file in the working directory
# and preview the first rows.
data=pd.read_csv("happiness_score_dataset.csv")
data.head()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Per-column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# Count of missing values in each column.
data.isna().sum()

There are no null values in the dataset.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

1. Mean and median are almost equal in Happiness Rank, Happiness Score, Economy, Health, Freedom and Dystopia Residual,
which suggests these columns are roughly symmetric (consistent with, but not proof of, a normal distribution).

2. The differences between quartiles suggest there are outliers present in Standard Error, Family, Trust, Generosity and
Dystopia Residual.

In [None]:
# Per-column skewness. 'Country' and 'Region' are still text at this point
# (they are label-encoded in a later cell), so restrict the computation to
# numeric columns; pandas >= 2.0 raises a TypeError on text columns otherwise.
data.skew(numeric_only=True)

In [None]:
# Label-encode the two categorical columns (Country, Region) into integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted for each column in turn, so after the loop
# it holds the mapping for the last column only.
label_encoder = LabelEncoder()
for categorical_col in ('Country', 'Region'):
    data[categorical_col] = label_encoder.fit_transform(data[categorical_col])


In [None]:
# Pairwise correlation matrix over all columns (pandas default method: Pearson).
datacor=data.corr()
datacor

In [None]:
# Multivariate analysis: annotated heatmap of the correlation matrix.
fig, heat_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(datacor, annot=True, cmap='YlOrRd_r', ax=heat_ax)

In [None]:
# Univariate analysis: one horizontal box plot per column to expose outliers.
plt.figure(figsize=(20, 25), facecolor='white')
# The 4x4 grid has 16 slots; only the first 12 columns are drawn.
for plotnumber, column in enumerate(data.columns[:12], start=1):
    ax = plt.subplot(4, 4, plotnumber)
    # Pass the values as `x=` explicitly. A bare positional Series is read as
    # `x` by seaborn 0.11 but as `data` by seaborn >= 0.12, which changes the
    # box orientation and makes the x-axis label below wrong.
    sns.boxplot(x=data[column], ax=ax)
    ax.set_xlabel(column, fontsize=20)
plt.tight_layout()

We can see outliers are present in the data.

In [None]:
# Univariate analysis: distribution (histogram + KDE) of each column.
plt.figure(figsize=(20, 25), facecolor='white')
# The 4x4 grid has 16 slots; only the first 12 columns are drawn.
for plotnumber, column in enumerate(data.columns[:12], start=1):
    ax = plt.subplot(4, 4, plotnumber)
    # sns.distplot is deprecated and removed in newer seaborn releases;
    # histplot with kde=True and stat="density" is its documented replacement
    # (kde_kws=dict(cut=3) reproduces distplot's KDE extent).
    sns.histplot(x=data[column], kde=True, stat="density",
                 kde_kws=dict(cut=3), ax=ax)
    ax.set_xlabel(column, fontsize=20)
plt.tight_layout()

Most of the data is normally distributed except Trust, Generosity, Standard Error and Family.

In [None]:
#Removing the outliers:
# Absolute z-score of every cell, computed per column over the whole frame
# (scipy's zscore defaults to axis=0). This covers every column of `data`,
# including the label-encoded ones and the target.
from scipy.stats import zscore
z=np.abs(zscore(data))
z

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column.
data_new=data[(z<3).all(axis=1)]

In [None]:
# Shape after the outlier filter.
data_new.shape

9 rows are deleted after removing the outliers

In [None]:
# Shape of the unfiltered frame, for comparison with data_new.shape.
data.shape

In [None]:
#Splitting the data for training and testing the model:
# Feature matrix: every column except the target. 'Happiness Rank' is dropped
# as well because it is the ordinal position of 'Happiness Score'
# (NOTE(review): confirm against the correlation heatmap), so keeping it as a
# feature would leak the target into the model.
x=data_new.drop(columns=['Happiness Score','Happiness Rank'])
x

In [None]:
# Target column, selected by name instead of by position (iloc[:, 3:4]) so a
# change in column order cannot silently pick a different column. The double
# brackets keep y a one-column DataFrame, as before.
y=data_new[['Happiness Score']]
y

In [None]:
# Scatter each feature against the target on a 4x4 grid (at most 12 panels).
plt.figure(figsize=(20, 30), facecolor='white')
for panel_no, feature in enumerate(list(x)[:12], start=1):
    panel = plt.subplot(4, 4, panel_no)
    panel.scatter(x[feature], y)
    panel.set_xlabel(feature, fontsize=20)
    panel.set_ylabel('Happiness Score', fontsize=20)
plt.tight_layout()

There is a lack of Trust and Generosity in most of the countries.

Economy, Family, Health and Freedom have the most positive impact on happiness score.

In [None]:
# (rows, feature columns) of the feature matrix.
x.shape

In [None]:
# Shape of the target frame; the row count should match x.
y.shape

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=40
)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm=LinearRegression()

In [None]:
# Fit on the training split only.
lm.fit(x_train,y_train)

In [None]:
# Fitted coefficients for the feature columns.
lm.coef_

In [None]:
# Fitted intercept term.
lm.intercept_

In [None]:
# R^2 on the training split (this is what LinearRegression.score returns).
lm.score(x_train,y_train)

In [None]:
# Predict on the held-out split and print the predictions and the true values.
# The labels now name the actual target: a happiness score, not a price.
pred=lm.predict(x_test)
print("predicted happiness score: ", pred)
print("actual happiness score: ",y_test)

In [None]:
# Test-set error metrics; the MSE is computed once and reused for the RMSE.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
print("error: ")
print("Mean absolute error: ", mae)
print("Mean squared error: ", mse)
print("Root mean squared error: ", rmse)

In [None]:
# R^2 on the held-out test split.
from sklearn.metrics import r2_score

test_r2 = r2_score(y_test, pred)
print(test_r2)

In [None]:
# Persist the fitted model to 'lmfile.obj' in the working directory.
import joblib
joblib.dump(lm,'lmfile.obj')