In [None]:
# MBA DATA SCIENCE & ANALYTICS USP/Esalq
#Supervised ML - SIMPLE AND MULTIPLE REGRESSION ANALYSIS
#Isabela Pereira Lima Dias

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from statsmodels.iolib.summary2 import summary_col
from skimage import io
import plotly.graph_objs as go
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder

In [None]:
# Dataset: read the CSV and rename the 'pais' / 'regiao' columns to 'country' / 'region'.
df = (
    pd.read_csv("corruption.csv", delimiter=",")
    .rename(columns={'pais': 'country', 'regiao': 'region'})
)
df

Data visualization

In [None]:
# Distribution of cpi: 20-bin histogram with the top and right frame lines hidden.
ax = df['cpi'].plot.hist(bins=20, title='cpi')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

In [None]:
# cpi values plotted as a line against the row index, top and right frame lines hidden.
ax = df['cpi'].plot.line(figsize=(8, 4), title='cpi')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

In [None]:
# Faceted distributions: one violin of cpi per region.
# The figure height scales with the number of distinct regions.
n_regions = len(df['region'].unique())
figsize = (12, 1.2 * n_regions)
plt.figure(figsize=figsize)
sns.violinplot(data=df, x='cpi', y='region', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
# Categorical distribution: number of rows per region as a horizontal bar chart.
region_sizes = df.groupby('region').size()
ax = region_sizes.plot.barh(color=sns.palettes.mpl_palette('Dark2'))
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

In [None]:
# Print the column dtypes / non-null counts, then display summary statistics
# of the numeric columns.
df.info()
df.describe()

In [None]:
# Histogram of cpi (20 bins) with the top and right frame lines hidden.
# NOTE(review): this cell used `_df_0`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df` is assumed to be the
# intended frame — confirm.
df['cpi'].plot(kind='hist', bins=20, title='cpi')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Summary statistics of the numeric columns, computed separately for each region.
df.groupby('region').describe()

In [None]:
# Frequency table: absolute count and relative frequency (as a fraction) of
# each region, with missing values counted as their own category.
region_col = df['region']
count = region_col.value_counts(dropna=False)
percent = region_col.value_counts(normalize=True, dropna=False)
pd.concat([count, percent], keys=["count", "%"], axis=1, sort=False)

In [None]:
# Region as a numeric variable (to be contrasted with dummy variables):
# each region label is mapped to an integer code, shifted so codes start at 1.
label_encoder = LabelEncoder()
region_codes = label_encoder.fit_transform(df['region'])
df['numeric_region'] = region_codes + 1
df.head(10)


In [None]:
# Number of rows per region as a horizontal bar chart.
# NOTE(review): this cell used `_df_4`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df` is assumed to be the
# intended frame — confirm.
df.groupby('region').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Scatter of cpi against the integer-coded region.
# NOTE(review): this cell used `_df_5`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df` is assumed to be the
# intended frame — confirm.
df.plot(kind='scatter', x='cpi', y='numeric_region', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# One violin of cpi per region; the figure height scales with the number of regions.
# NOTE(review): this cell used `_df_8`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df` is assumed to be the
# intended frame — confirm.
figsize = (12, 1.2 * len(df['region'].unique()))
plt.figure(figsize=figsize)
sns.violinplot(df, x='cpi', y='region', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
# Histogram of the integer-coded region (20 bins).
# NOTE(review): this cell used `_df_3`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df` is assumed to be the
# intended frame — confirm.
df['numeric_region'].plot(kind='hist', bins=20, title='numeric_region')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Summary statistics of the integer-coded region. The mean and quartiles shown
# here depend entirely on how the labels happened to be numbered.
df['numeric_region'].describe()

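Note from the example above that treating the variable `region` as numeric imposes an arbitrary weighting on its categories: the codes 1, 2, 3, … suggest an ordering and an equal spacing between regions that do not exist. See, for instance, the categorical distribution graphs. The solution is to treat the variable `region` as a set of dummy variables.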


In [None]:
# One-hot encode region; drop_first=True leaves n-1 dummies, so the dropped
# region acts as the reference category of the regression.
# dtype=int pins the dummies to 0/1 integers. Recent pandas versions otherwise
# emit booleans, which formula parsers treat as categorical terms rather than
# plain numeric regressors.
df_dummies = pd.get_dummies(df, columns=['region'], drop_first=True, dtype=int)
df_dummies.head(10)

In [None]:
# Regression model with n-1 dummies: cpi explained by the four region dummies.
# The formula is assembled from adjacent string literals so that no whitespace
# from line continuations (tabs, indentation) leaks into the formula text.
model_dummies = sm.OLS.from_formula(
    "cpi ~ region_Asia + "
    "region_EUA_e_Canada + "
    "region_Europa + "
    "region_Oceania",
    df_dummies,
).fit()

In [None]:
# Display the regression summary of the model fitted with the explicit formula.
model_dummies.summary()

In [None]:
# Alternative way -> when we have many dummies on the dataset:
# every column except the response and the identifier/encoded columns becomes
# a regressor, and the formula text is generated from those column names.
non_predictors = ['cpi', 'country', 'numeric_region']
columns_list = list(df_dummies.columns.drop(non_predictors))
model_dummies_formula = "cpi ~ " + ' + '.join(columns_list)
print("Formula: ",model_dummies_formula)

model_dummies = sm.OLS.from_formula(model_dummies_formula, df_dummies).fit()

In [None]:
# Display the regression summary of the model fitted with the generated formula.
model_dummies.summary()

In [None]:
# Graphs of our dummy model
# Store the model's fitted values as a new 'fitted' column so they can be
# plotted alongside the observed cpi.
df_dummies['fitted'] = model_dummies.fittedvalues
df_dummies.head()

In [None]:
# Number of rows per country as a horizontal bar chart.
# NOTE(review): this cell used `_df_26`, a name never assigned in this notebook,
# so it failed with NameError on a fresh kernel. `df_dummies` is assumed to be
# the intended frame (it may have been only the displayed head) — confirm.
df_dummies.groupby('country').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
#Plotting our model
from scipy import interpolate
plt.figure(figsize=(10,10))
# One point per region code: the median fitted value within each region.
df2 = df_dummies[["numeric_region", "fitted"]].groupby(['numeric_region']).median().reset_index()
x = df2["numeric_region"]
y = df2["fitted"]

# Degree-2 spline through the per-region fitted values.
tck = interpolate.splrep(x, y, k=2)
# Evaluate the spline over the full observed range of region codes. A
# hard-coded np.arange(1, 5, 0.1) would stop at 4.9 and never reach the last
# region, and would be wrong for any other number of regions.
xnew = np.linspace(x.min(), x.max(), 200)
ynew = interpolate.splev(xnew, tck, der=0)


def label_point(x, y, val, ax):
    """Write a text label next to each (x, y) point on ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates. Each label is drawn 0.02 to the right of ``x`` at
        height ``y``.
    val : pandas.Series
        Value shown in the label; the label text is ``"<val> <y>"``.
    ax : object
        Target axes; only its ``text(x, y, s)`` method is used.

    Returns
    -------
    None
    """
    # The three series are combined column-wise so each row carries one point.
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in points.iterrows():
        caption = " ".join((str(row['val']), str(row['y'])))
        ax.text(row['x'] + .02, row['y'], caption)

# Draw the observed cpi, the model's fitted values and the spline curve on the
# current axes, then annotate every observation with its country and cpi.
ax = plt.gca()
ax.scatter(df_dummies['numeric_region'], df_dummies['cpi'])
ax.scatter(df_dummies['numeric_region'], df_dummies['fitted'])
ax.plot(xnew, ynew)
ax.set_title('Arbitrary ponderation')
ax.set_xlabel('Region')
ax.set_ylabel('Corruption Perception Index')
label_point(x=df['numeric_region'], y=df['cpi'], val=df['country'], ax=ax)