In [4]:
#Importing Important Libraries.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.gridspec as grid_spec
import seaborn as sns


AttributeError: module 'matplotlib' has no attribute 'rcParams'

#### 1.1 - Business/situation objectives must be logical, and in line with data mining goals and the business success criteria. Must be conveyed in a clear manner.

#### 1.2 - Situation assessment must effectively describe the resources, requirements, assumptions, constraints, risks and contingencies of the project.

#### 1.3 - Data mining goals must be achievable and closely aligned with the business objectives/success criteria. Must be conveyed in a clear manner.

#### 向Envision2030 Goal 3: Good Health and Well-being靠拢，本文主要运用世界幸福指数报告数据，预测幸福指数，结果显示医疗、教育支出高的国家，幸福指数越高。提供了针对国家对Goal 3的参考意义。

#### 1.4 - The project plan must address how each phase of the project will be carried out for the current iteration. A day-to-day timeline must be proposed within the project plan.

#### Timeline建议与之前一致

In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
# Raw string: in a normal literal the backslash in '\s' is an invalid escape sequence
# (a warning on current Python versions); the path value itself is unchanged.
findspark.init(r'F:\spark-3.3.0-bin-hadoop2')
import pyspark
from pyspark.sql import SparkSession
# Build the Spark session used by the data-loading cells (or reuse one that already exists).
spark = SparkSession.builder.appName('linear_regression_adv').getOrCreate()

### Import Datasets

#### 2.1 - Collect initial data and describe where the data was collected from, how it was collected and any issues encountered during collection.

#### Collect from Kaggle https://www.kaggle.com/datasets/ajaypalsinghlo/world-happiness-report-2021

In [None]:
# Read the 2021 report into a Spark DataFrame: header=True takes the column names
# from the first row and inferSchema=True lets Spark infer the column types.
data = spark.read.csv("world-happiness-report-2021.csv",inferSchema=True,header=True)

In [None]:
# Print the column names and the types Spark inferred for them.
data.printSchema()

In [None]:
# Called without an argument, head() returns only the first row.
data.head()

In [None]:
# Print every field of the first row on its own line.
first_row = data.head()
for field in first_row:
    print(field)

In [None]:
# Collect the Spark DataFrame into a local pandas DataFrame for the pandas/seaborn analysis.
df = data.toPandas()

In [None]:
# Alternative load path kept for reference: read the same CSV directly with pandas.
#df=pd.read_csv('world-happiness-report-2021.csv')
# NOTE(review): this is not the last expression of the cell, so the preview is not rendered.
df.head()
# Keep an independent copy of the full frame before any columns are dropped or renamed.
original=df.copy()
def highlight_max(s):
    """Build the per-cell CSS for one Series: green background on its maximum.

    Returns a list with one entry per element of ``s``: the limegreen
    background declaration where the element equals ``s.max()`` and an
    empty string everywhere else (ties are all highlighted).
    """
    css_for = {True: 'background-color: limegreen', False: ''}
    return [css_for[bool(hit)] for hit in s.eq(s.max())]

# Render the frame with highlight_max applied to each of the listed numeric columns,
# so the largest value in every one of those columns is shaded green.
df.style.apply(highlight_max, subset=['Ladder score','Logged GDP per capita','Social support','Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption'])

### Checking out the shape of the dataset

In [None]:
# Columns kept for the analysis: country/region labels, the ladder score and six factors.
cols = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

# Restrict the frame to those columns, then report (rows, columns).
df = df.loc[:, cols]
df.shape

#### 2.2 – Data description must describe the format, quantity, fields and surface-level features of the data.

In [None]:
# Summarise every column: its dtype and its number of non-null values
df.info()

#### 2.3 - Data exploration must assist readers in understanding the data through the usage of strong visualisations (visualising the raw data). Must be communicated in a clear manner and explicitly linked to the rest of the steps.

#### 2.4 - Data quality must be verified by checking for errors, missing values, and data quality patterns explicitly

In [None]:
# Check Null values
# happy_df is a second name for the same DataFrame object as df (no copy is made),
# so in-place changes made through either name are visible through both.
happy_df = df
# Number of missing values per column.
happy_df.isnull().sum()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
happy_df.describe()

#### 3. Data Preparation As a general guide: 3.1 - Data must be selected effectively. Goals, data quality, technical constraints, among other factors should be explicitly considered. 3.2 - To clean the data, issues must be made explicit, then explicitly resolved. 3.3 - Data must be appropriately constructed through the creation of new features/variables, and/or data repositories/tables. 3.4 - Integration must take place. This includes effectively merging data from various sources. 3.5 - Reformatting includes changing the formats of different data sources and trimming content, among other steps specific to the data.

In [None]:
# Rename columns to short snake_case names for interpretability.
column_aliases = {
    'Country name': 'country',
    'Regional indicator': 'region',
    'Logged GDP per capita': 'GDP_per_capita',
    'Social support': 'social_score',
    'Healthy life expectancy': 'life_expectancy',
    'Freedom to make life choices': 'freedom_choice',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'corruption_score',
    'Ladder score': 'happiness_score',
}

# df and happy_df name the same object, and the rename is done in place,
# so the new column names are visible through both names afterwards.
df = happy_df
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the original note states the count is zero, so nothing is dropped — confirm against the rendered output.
df.duplicated().sum()

In [None]:
# List the distinct regions present in the data (this shows region names, not a count of countries)
df['region'].unique()

### Plot Relation between happiness and GDP

In [None]:
# Happiness score against logged GDP per capita, one colour per region.
fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot(data=df, x='GDP_per_capita', y='happiness_score', hue='region', s=85, ax=ax)
ax.legend(loc='best', fontsize=10)
ax.set_title('GDP per capita on Happiness', fontsize=20)

### We can see that happiness has a positive relationship with GDP

### Relation between happiness and life_expectancy

In [None]:
# Happiness score against healthy life expectancy, one colour per region.
fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot(data=df, x='life_expectancy', y='happiness_score', hue='region', s=85, ax=ax)
ax.legend(loc='best', fontsize=10)
ax.set_title('Life Expectancy on Happiness', fontsize=20)

### Relation between happiness and freedom of choice

In [None]:
# Happiness score against freedom to make life choices, one colour per region.
fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot(data=df, x='freedom_choice', y='happiness_score', hue='region', s=80, ax=ax)
ax.legend(loc='best', fontsize=10)
ax.set_title('Dependence of freedom of Choice on Happiness', fontsize=20)

### Correlation

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# The frame still carries text columns (country and region names); DataFrame.corr()
# raises on those with pandas >= 2.0, so correlate the numeric columns only.
plt.figure(figsize=(6,5))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(),cmap='coolwarm',annot=True)

### We can see that life expectancy is highly correlated with GDP and the social score.

# Regression


In [None]:
import seaborn as seabornInstance
# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
# STATS
from sklearn import datasets, metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingRegressor


#### 4.2 - The data must be projected through the use of statistical transformations (such as taking the log of a distribution).

In [None]:
# 2021 modelling frame: the renamed data without the text-valued region column.
# drop() returns a new DataFrame, so happy_df itself keeps its region column.
df_2021 = happy_df.drop(columns=['region'])

In [None]:
# Every row of this frame comes from the 2021 file, so the constant 2021 is assigned to all rows.
df_2021['Year'] = 2021 #add year column
df_2021.head()

In [None]:
# NOTE(review): presumably the multi-year World Happiness Report file — confirm its provenance.
df_all=pd.read_csv('world-happiness-report.csv')
# Disabled experiment: average the rows per country.
#df2 = df2.groupby(['Country name']).mean()
# Display the loaded frame.
df_all

In [None]:
# Remove the two affect columns in place; they are not used as regressors here.
df_all.drop(columns=['Positive affect','Negative affect'], inplace=True)

In [None]:
# Rename the multi-year columns in place to the same short names used for the 2021 data.
# Keys that do not match an existing column are simply ignored by rename().
all_years_aliases = {
    'Country name': 'country',
    'year': 'year',
    'Regional indicator': 'region',
    'Log GDP per capita': 'GDP_per_capita',
    'Social support': 'social_score',
    'Healthy life expectancy at birth': 'life_expectancy',
    'Freedom to make life choices': 'freedom_choice',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'corruption_score',
    'Life Ladder': 'happiness_score',
}
df_all.rename(columns=all_years_aliases, inplace=True)

# Missing values per column after the rename.
df_all.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
df_all = df_all.dropna(axis = 0)

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so this null count is computed but not shown.
df_all.isnull().sum()
# Display the cleaned frame.
df_all

In [None]:
def PlotScore(y_train, y_train_pred, y_test, y_test_pred):
  '''
  Draw actual-vs-predicted line plots for the train and the test data.

  Two side-by-side panels share one figure: the left one plots
  y_train (x axis) against y_train_pred (y axis), the right one does
  the same for y_test and y_test_pred. The figure is shown at the end.
  '''
  panels = (
      ('For Train Data', y_train, y_train_pred),
      ('For Test Data', y_test, y_test_pred),
  )

  plt.figure(figsize = [13.66, 6])
  for position, (heading, actual, predicted) in enumerate(panels, start = 1):
    # Select panel `position` of the 1x2 grid, then draw into it.
    plt.subplot(1, 2, position)
    sns.lineplot(x = actual, y = predicted, marker = 'o')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(heading)
  plt.show()

#### 5.1 - Match and discuss DM methods within the context of the DM objectives.

#### 5.2 - Select the appropriate DM method(s) in a logical manner. The selected DM method must be in line with the data mining goal/success criteria.

#### Choose Linear Regression

#### Simple Linear Regression for the 2021 dataset

In [None]:
# Empty results table; each model cell appends one row of metrics to it.
evaluation_columns = [
    'Model',
    'Dataset',
    'Root Mean Squared Error (RMSE)',
    'R-squared (training)',
    'R-squared (test)',
    '5-Fold Cross Validation(RMSE)',
]
evaluation = pd.DataFrame({column: [] for column in evaluation_columns})

In [None]:
# Column names available in the 2021 modelling frame.
df_2021.columns

In [None]:
# Simple linear regression on the 2021 data: happiness_score ~ GDP_per_capita.
# 80% of the rows are used for training; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df_2021, train_size = 0.8, random_state = 128)

# Plain float arrays: X has shape (n, 1), y is 1-D. (The previous `dtype = pd.Series`
# is not a real dtype and produced object arrays.)
X_train = train_data[['GDP_per_capita']].to_numpy(dtype = float)
y_train = train_data['happiness_score'].to_numpy(dtype = float)
X_test = test_data[['GDP_per_capita']].to_numpy(dtype = float)
y_test = test_data['happiness_score'].to_numpy(dtype = float)

lr = LinearRegression()
lr.fit(X_train, y_train)

pred = lr.predict(X_test)
#ROOT MEAN SQUARED ERROR
rmsesm = round(float(np.sqrt(metrics.mean_squared_error(y_test, pred))), 3)
#R-SQUARED (TRAINING)
rtrsm = round(float(lr.score(X_train, y_train)), 3)
#R-SQUARED (TEST)
rtesm = round(float(lr.score(X_test, y_test)), 3)
#5-FOLD CROSS-VALIDATED RMSE
# The scorer returns the negated MSE of each fold; flip the sign and take the square
# root so the stored value really is an RMSE, as the results column name says.
fold_mse = -cross_val_score(lr, df_2021[['GDP_per_capita']], df_2021['happiness_score'],
                            cv = 5, scoring = 'neg_mean_squared_error')
cv = round(float(np.sqrt(fold_mse).mean()), 3)

print ("Average Score for Test Data: {:.3f}".format(y_test.mean()))
print('Intercept: {}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))
print('Happiness score = ',np.round(lr.intercept_,4),
      '+',np.round(lr.coef_,4),'* GDP')

# Append this model's metrics as the next row of the results table.
r = evaluation.shape[0]
evaluation.loc[r] = ['Simple Linear Regression','2021 datasets',rmsesm,rtrsm,rtesm,cv]
evaluation

#### 6.1 Conduct exploratory analysis of DM algorithms within the context of the DM objectives. Then, discuss the analysis.
#### 6.2 - Select algorithm(s) in a logical manner based on the exploratory analysis and discussion.
#### 6.3 - Model(s) must be selected/built, and the appropriate algorithm/model parameter(s) must be selected.

#### GDP and happiness show a positive relationship

In [None]:
# Predict on both splits with the fitted model and plot actual vs predicted side by side.
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
PlotScore(y_train, y_train_pred, y_test, y_test_pred)

# Simple Linear Regression for the large (multi-year) dataset

In [None]:
# Simple linear regression on the multi-year data: happiness_score ~ GDP_per_capita.
# 80% of the rows are used for training; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df_all, train_size = 0.8, random_state = 128)

# Plain float arrays: X has shape (n, 1), y is 1-D. (The previous `dtype = pd.Series`
# is not a real dtype and produced object arrays.)
X_train = train_data[['GDP_per_capita']].to_numpy(dtype = float)
y_train = train_data['happiness_score'].to_numpy(dtype = float)
X_test = test_data[['GDP_per_capita']].to_numpy(dtype = float)
y_test = test_data['happiness_score'].to_numpy(dtype = float)

lr = LinearRegression()
lr.fit(X_train, y_train)

pred = lr.predict(X_test)
#ROOT MEAN SQUARED ERROR
rmsesm = round(float(np.sqrt(metrics.mean_squared_error(y_test, pred))), 3)
#R-SQUARED (TRAINING)
rtrsm = round(float(lr.score(X_train, y_train)), 3)
#R-SQUARED (TEST)
rtesm = round(float(lr.score(X_test, y_test)), 3)
#5-FOLD CROSS-VALIDATED RMSE
# The scorer returns the negated MSE of each fold; flip the sign and take the square
# root so the stored value really is an RMSE, as the results column name says.
fold_mse = -cross_val_score(lr, df_all[['GDP_per_capita']], df_all['happiness_score'],
                            cv = 5, scoring = 'neg_mean_squared_error')
cv = round(float(np.sqrt(fold_mse).mean()), 3)

print ("Average Score for Test Data: {:.3f}".format(y_test.mean()))
print('Intercept: {}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))
print('Happiness score = ',np.round(lr.intercept_,4),
      '+',np.round(lr.coef_,4),'* GDP')

# Append this model's metrics as the next row of the results table.
r = evaluation.shape[0]
evaluation.loc[r] = ['Simple Linear Regression','Large Datasets',rmsesm,rtrsm,rtesm,cv]
evaluation

In [None]:
# Predict on both splits with the fitted model and plot actual vs predicted side by side.
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
PlotScore(y_train, y_train_pred, y_test, y_test_pred)

# Multiple Linear Regression for the large (multi-year) dataset

In [None]:
# Column names available in the multi-year frame (used to pick the regressors).
df_all.columns

In [None]:
# MULTIPLE LINEAR REGRESSION 1
# Multiple linear regression on the multi-year data: happiness_score ~ six factors.
# 80% of the rows are used for training; the fixed seed keeps the split reproducible.
train_data_dm,test_data_dm = train_test_split(df_all,train_size = 0.8,random_state=128)
independent_var = ['GDP_per_capita', 'social_score',
       'life_expectancy', 'freedom_choice', 'generosity', 'corruption_score']

# Plain float arrays: X has one column per regressor, y is 1-D. (The previous
# `dtype = pd.Series` is not a real dtype and produced object arrays.)
X_train = train_data_dm[independent_var].to_numpy(dtype = float)
y_train = train_data_dm['happiness_score'].to_numpy(dtype = float)
X_test = test_data_dm[independent_var].to_numpy(dtype = float)
y_test = test_data_dm['happiness_score'].to_numpy(dtype = float)

# Fit on the same arrays that are used for prediction, so fit and predict
# see the same kind of input.
multi_Lr = LinearRegression()
multi_Lr.fit(X_train, y_train)

print('Intercept: {}'.format(multi_Lr.intercept_))
print('Coefficients: {}'.format(multi_Lr.coef_))
print('Happiness score = ',np.round(multi_Lr.intercept_,4),
      '+',np.round(multi_Lr.coef_[0],4),'∗ GDP_per_capita',
      '+',np.round(multi_Lr.coef_[1],4),'* social_score', 
      '+',np.round(multi_Lr.coef_[2],4),'* life_expectancy',
      '+',np.round(multi_Lr.coef_[3],4),'* freedom_choice',
       '+',np.round(multi_Lr.coef_[4],4),'* generosity',
      '+',np.round(multi_Lr.coef_[5],4),'* corruption_score')

pred = multi_Lr.predict(X_test)
#ROOT MEAN SQUARED ERROR (3 decimals, like the other numbers stored in the results table)
rmsecm = round(float(np.sqrt(metrics.mean_squared_error(y_test, pred))), 3)
#R-SQUARED (TRAINING)
rtrcm = round(float(multi_Lr.score(X_train, y_train)), 3)
#R-SQUARED (TEST)
rtecm = round(float(multi_Lr.score(X_test, y_test)), 3)
#5-FOLD CROSS-VALIDATED RMSE
# The scorer returns the negated MSE of each fold; flip the sign and take the square
# root so the stored value really is an RMSE, as the results column name says.
fold_mse = -cross_val_score(multi_Lr, df_all[independent_var], df_all['happiness_score'],
                            cv = 5, scoring = 'neg_mean_squared_error')
cv = round(float(np.sqrt(fold_mse).mean()), 3)

# Record THIS model's metrics (rmsecm / rtrcm / rtecm). The row previously reused
# rmsesm / rtrsm / rtesm, i.e. the numbers of the simple regression.
r = evaluation.shape[0]
evaluation.loc[r] = ['Multiple Linear Regression-1','Large Datasets',rmsecm,rtrcm,rtecm,cv]
evaluation.sort_values(by = '5-Fold Cross Validation(RMSE)', ascending=False)

In [None]:
# Predict on both splits with the multiple-regression model and plot actual vs predicted.
y_train_pred = multi_Lr.predict(X_train)
y_test_pred = multi_Lr.predict(X_test)
PlotScore(y_train, y_train_pred, y_test, y_test_pred)

#### We can see that life_expectancy (health care) plays an important role in happiness