<a href="https://colab.research.google.com/github/ivonnics/Machine-Learning/blob/master/EDUCATION_DATA_ANALYSIS_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Taken from

- https://matplotlib.org/matplotblog/posts/create-ridgeplots-in-matplotlib/
- https://towardsdatascience.com/exploratory-data-analysis-with-pandas-profiling-de3aae2ddff3
- https://towardsdatascience.com/a-quick-guide-on-descriptive-statistics-using-pandas-and-seaborn-2aadc7395f32

In [0]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
import seaborn as sns
from pandas_profiling import ProfileReport
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as grid_spec

In [0]:
# Load the mock European test-results CSV straight from GitHub.
data = pd.read_csv("https://raw.githubusercontent.com/petermckeever/mock-data/master/datasets/mock-european-test-results.csv")

# Profiling report object; it is only displayed if `prof` is evaluated in a cell.
prof = ProfileReport(data)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()
print(data.country.value_counts())
print(data.head())

# Descriptive Analysis

In [0]:
# Uncomment the next line to display the ProfileReport built from `data`
# in the loading cell (left disabled here; presumably because the report
# output is large -- TODO confirm).
#prof

In [0]:
# Most frequent score value(s) first, then the summary-statistics table.
score_mode = data['score'].mode()
print('Mode = ', score_mode)
data.describe()

In [0]:
# Keep only the rows whose country is 'Germany', restricted to four columns.
is_germany = data.country == 'Germany'
dataG = data.loc[is_germany, ['score', 'country', 'age', 'sex']]

# Text summary and first rows, followed by the histograms drawn by DataFrame.hist.
for table in (dataG.describe(), dataG.head()):
    print(table)
dataG.hist(figsize=(12, 5))
plt.show()

# Graphics

In [0]:
# Ridge plot: one kernel-density curve of `score` per country, each drawn on
# its own axes, with the axes stacked vertically and made to overlap.
countries = list(np.unique(data.country))
colors = ['#0000ff', '#3300cc', '#660099', '#990066', '#cc0033', '#ff0000']

gs = grid_spec.GridSpec(len(countries), 1)
fig = plt.figure(figsize=(14, 7))

# Loop-invariant values: the evaluation grid shared by every curve and the
# names of the four spines that get hidden on every axes.
x_d = np.linspace(0, 1, 1000)
spine_names = ("top", "right", "left", "bottom")

ax_objs = []
for i, country in enumerate(countries):
    x = np.array(data[data.country == country].score)

    kde = KernelDensity(bandwidth=0.03, kernel='gaussian')
    kde.fit(x[:, None])

    # score_samples yields log-density values; exponentiate once and reuse
    # the result for both the outline and the filled area.
    density = np.exp(kde.score_samples(x_d[:, None]))

    # creating new axes object in row i of the grid
    ridge_ax = fig.add_subplot(gs[i:i+1, 0:])
    ax_objs.append(ridge_ax)

    # plotting the distribution; cycle through the palette so that having
    # more countries than colours does not raise an IndexError
    ridge_ax.plot(x_d, density, color="#f0f0f0", lw=1)
    ridge_ax.fill_between(x_d, density, alpha=1, color=colors[i % len(colors)])

    # setting uniform x and y lims
    ridge_ax.set_xlim(0, 1)
    ridge_ax.set_ylim(0, 2.5)

    # make background transparent so the overlapping axes do not hide each other
    ridge_ax.patch.set_alpha(0)

    # remove borders, axis ticks, and labels
    ridge_ax.set_yticklabels([])

    # only the last (bottom) axes keeps its x tick labels and gets the x label
    if i == len(countries) - 1:
        ridge_ax.set_xlabel("Test Score", fontsize=16, fontweight="bold")
    else:
        ridge_ax.set_xticklabels([])

    for spine_name in spine_names:
        ridge_ax.spines[spine_name].set_visible(False)

    # country name to the left of the curve, one word per line
    adj_country = country.replace(" ", "\n")
    ridge_ax.text(-0.02, 0, adj_country, fontweight="bold", fontsize=14, ha="right")

# negative hspace makes consecutive axes overlap
gs.update(hspace=-0.8)

fig.text(0.07,0.85,"Distribution of Aptitude Test Results from 18 – 24 year-olds",fontsize=20)

plt.tight_layout()
plt.show()

In [0]:
%matplotlib inline

sns.set(style="whitegrid")
plt.figure(figsize=(10,8))
ax = sns.boxplot(x='country', y='score', data=data, orient="v")

In [0]:
# Box plot of the age distribution, one box per country.
sns.set(style="whitegrid")
age_box_fig = plt.figure(figsize=(10, 8))
ax = sns.boxplot(data=data, x='country', y='age', orient="v",
                 ax=age_box_fig.add_subplot(111))

In [0]:
# Row counts per country, arranged in a fixed wedge order for the pie chart.
country_order = ['United Kingdom', 'Spain', 'Germany', 'France', 'Italy', 'Ireland']
type_counts = data.country.value_counts()

# Passing `index` aligns the counts to `country_order`.
df2 = pd.DataFrame({'Paises': type_counts}, index=country_order)
df2.plot.pie(y='Paises', figsize=(10, 10), autopct='%1.1f%%')

# Crosstab

In [0]:
# Raise the column display limit to 500 before showing the crosstab.
pd.options.display.max_columns = 500

# Contingency table: one row per country, one column per distinct score.
testing = pd.crosstab(index=data['country'], columns=data['score'])
testing

In [0]:
# Heatmap of the country x score counts, with each cell annotated by its value.
score_by_country = pd.crosstab([data.country], [data.score])
fig, ax = plt.subplots(figsize=(20, 5))  # figsize is given in inches
sns.heatmap(score_by_country, cmap="YlGnBu", annot=True, cbar=False, ax=ax)

In [0]:
# Contingency table: rows indexed by (country, sex), one column per distinct score.
testing2 = pd.crosstab(index=[data['country'], data['sex']], columns=data['score'])
testing2

In [0]:
# Heatmap of the (country, sex) x score counts, with each cell annotated by its value.
score_by_country_sex = pd.crosstab([data.country, data.sex], [data.score])
fig, ax = plt.subplots(figsize=(20, 7))  # figsize is given in inches
sns.heatmap(score_by_country_sex, cmap="YlGnBu", annot=True, cbar=False, ax=ax)

In [0]:
# Contingency table: rows indexed by (country, age), one column per distinct score.
testing3 = pd.crosstab(index=[data['country'], data['age']], columns=data['score'])
testing3

In [0]:
# Heatmap of the (country, age) x score counts, with each cell annotated by its value.
score_by_country_age = pd.crosstab([data.country, data.age], [data.score])
fig, ax = plt.subplots(figsize=(19, 8))  # figsize is given in inches
sns.heatmap(score_by_country_age, cmap="YlGnBu", annot=True, cbar=False, ax=ax)

In [0]:
# Nested pie chart: the outer ring (radius 3) shows the row count per country,
# the inner ring (radius 2) shows the same rows split by sex.
labels = ["United Kingdom", "Spain", "Italy", "Ireland", "Germany", "France"]

# One single-column ('score') frame per (country, sex) pair, in the order of
# `labels`, with 'male' before 'female' inside each country.
gender_frames = [
    data.loc[(data['country'] == country_name) & (data['sex'] == sex_name), ['score']]
    for country_name in labels
    for sex_name in ('male', 'female')
]
(dfUKM, dfUKF, dfSPM, dfSPF, dfITM, dfITF,
 dfIRM, dfIRF, dfGEM, dfGEF, dfFRM, dfFRF) = gender_frames

# Data to plot
labels_gender = [
    code + suffix
    for code in ('UK', 'SP', 'IT', 'IR', 'GE', 'FR')
    for suffix in ('M', 'F')
]
sizes_gender = [len(frame) for frame in gender_frames]
# Country total = male count + female count (adjacent entries of sizes_gender).
sizes = [males + females for males, females in zip(sizes_gender[0::2], sizes_gender[1::2])]
colors = ['#ff6666', '#66b3ff','#3f00f0','#ffcc00', '#aa00f0','#aaffe6']
colors_gender = ['#c2c2f0', '#ffb3e6'] * len(labels)
explode = (0.2,) * len(labels)
explode_gender = (0.1,) * len(labels_gender)

# Plot
plt.pie(sizes, labels=labels, colors=colors, startangle=90, frame=True,
        autopct='%1.1f%%', explode=explode, radius=3)
plt.pie(sizes_gender, labels=labels_gender, colors=colors_gender, startangle=90,
        autopct='%1.1f%%', explode=explode_gender, radius=2)

# Draw a white circle over the centre of the pies
centre_circle = plt.Circle((0, 0), 1.5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis('equal')
plt.tight_layout()
plt.show()

# Row counts per (country, sex) group, in the same order as labels_gender.
print(*sizes_gender)

### Mean and Median:

In [0]:
# NOTE(review): `statistics` is not used in this cell; the import is kept in
# case a cell outside this view relies on it.
import statistics

# Display label -> single-column ('score') frame for that country/sex group.
score_groups = {
    'UK males': dfUKM,
    'UK females': dfUKF,
    'Spain males': dfSPM,
    'Spain females': dfSPF,
    'Italy males': dfITM,
    'Italy females': dfITF,
    'Ireland males': dfIRM,
    'Ireland females': dfIRF,
    'Germany males': dfGEM,
    'Germany females': dfGEF,
    'France males': dfFRM,
    'France females': dfFRF,
}

# Build the summary table in one step from numeric rows. Creating an empty
# frame and assigning rows through .loc leaves the columns with object dtype;
# constructing it from the values directly gives float columns.
dfmedian = pd.DataFrame(
    [[group.score.mean(), group.score.median()] for group in score_groups.values()],
    index=list(score_groups),
    columns=['Mean', 'Median'],
)
dfmedian

In [0]:
# Grouped bar chart of mean vs. median score for each country/sex group.
dfmedian.plot(kind='bar', figsize=(16, 5), rot=30)