# Effectuez une prédiction de revenus

### Mission 3

- Question 9 :

### Importation des librairies <a class="anchor" id="librairies"></a>

In [1]:
# Importation des librairies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.stats as st
from collections import Counter
import time

In [2]:
# Text encoding passed to DataFrame.to_csv when the final dataset is exported.
ENCODAGE = 'UTF-8'

In [3]:
def generate_incomes(n, pj):
    """Simulate `n` (child, parent) income pairs linked by the coefficient `pj`.

    Parent log-incomes and the error term epsilon are each drawn from a
    standard normal distribution; the child's log-income is
    ``pj * ln(y_parent) + epsilon``. The mean and variance chosen for the
    parents have no effect on the income classes computed afterwards.

    Parameters
    ----------
    n : int
        Number of pairs to draw.
    pj : float
        Multiplier applied to the parent's log-income.

    Returns
    -------
    tuple
        ``(y_child, y_parents)``: two arrays of `n` incomes (exponentials of
        the simulated log-incomes).
    """
    standard_normal = st.norm(0, 1)
    # Parents first, then the error term: same draw order as the model states.
    log_parent_income = standard_normal.rvs(size=n)
    epsilon = standard_normal.rvs(size=n)
    log_child_income = pj * log_parent_income + epsilon
    return np.exp(log_child_income), np.exp(log_parent_income)

In [4]:
def quantiles(l, nb_quantiles):
    """Assign every value of `l` to one of `nb_quantiles` ordered classes.

    The values are sorted and paired with evenly spaced positions that are
    rounded to integer class numbers starting at 1. Classes are then looked
    up by value, so equal values share a class (the one given to the last
    of them in sorted order).

    Parameters
    ----------
    l : pandas.Series
        Values to classify (``copy`` and ``sort_values`` are called on it).
    nb_quantiles : int
        Number of classes.

    Returns
    -------
    pandas.Series
        Class number of each element of `l`, in the order of `l`, with a
        fresh default index.
    """
    size = len(l)
    ordered = l.copy().sort_values()

    # One position per sorted value; the shift makes rounding give class numbers.
    positions = np.arange(1, nb_quantiles + 1, nb_quantiles / size) - 0.5 + 1. / size
    classes = np.round(positions)

    class_of_value = {}
    for value, class_number in zip(ordered, classes):
        class_of_value[value] = int(class_number)

    return pd.Series([class_of_value[value] for value in l])

In [5]:
def compute_quantiles(y_child, y_parents, nb_quantiles):
    """Build a table of child/parent incomes together with their income classes.

    Parameters
    ----------
    y_child, y_parents : array-like
        Child and parent incomes; each is wrapped in a ``pandas.Series``.
    nb_quantiles : int
        Number of income classes, forwarded to ``quantiles``.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_child``, ``y_parents``, ``c_i_child`` and ``c_i_parent``.
    """
    child_income = pd.Series(y_child)
    parent_income = pd.Series(y_parents)

    pieces = [
        child_income,
        parent_income,
        quantiles(child_income, nb_quantiles),
        quantiles(parent_income, nb_quantiles),
    ]
    # `keys` names the columns directly while concatenating side by side.
    return pd.concat(
        pieces,
        axis=1,
        keys=["y_child", "y_parents", "c_i_child", "c_i_parent"],
    )

In [6]:
def distribution(counts, nb_quantiles):
    """Turn per-parent-class counts into a list of relative frequencies.

    Parameters
    ----------
    counts : pandas.DataFrame
        Must provide the columns ``c_i_parent`` and ``counts``.
    nb_quantiles : int
        Number of parent classes; classes are numbered 1..nb_quantiles.

    Returns
    -------
    list
        One entry per parent class: ``count / total`` using the first row
        matching that class, or ``0`` when the class is absent. When the
        counts sum to zero the result is a list of `nb_quantiles` zeros.
    """
    total = counts["counts"].sum()
    if total == 0:
        return [0] * nb_quantiles

    shares = []
    for parent_class in range(1, nb_quantiles + 1):
        matches = counts.loc[counts.c_i_parent == parent_class, "counts"].values
        # Only the first matching row is used.
        shares.append(matches[0] / total if len(matches) else 0)
    return shares

In [7]:
def conditional_distributions(sample, nb_quantiles):
    """Estimate the distribution of the parent class for every child class.

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain the columns ``c_i_child`` and ``c_i_parent``.
    nb_quantiles : int
        Number of income classes; classes are numbered 1..nb_quantiles.

    Returns
    -------
    numpy.ndarray
        Array with one row per child class (row ``i`` is class ``i + 1``);
        each row is the list returned by ``distribution`` for that class,
        i.e. the relative frequency of every parent class.
    """
    # size() counts the rows of each (child, parent) pair without calling a
    # Python function per group, and yields a well-formed three-column table
    # even when `sample` is empty.
    counts = (
        sample.groupby(["c_i_child", "c_i_parent"])
        .size()
        .reset_index(name="counts")
    )

    mat = [
        distribution(counts[counts.c_i_child == child_quantile], nb_quantiles)
        for child_quantile in range(1, nb_quantiles + 1)
    ]
    return np.array(mat)



In [8]:
def plot_conditional_distributions(p, cd, nb_quantiles):
    """Draw the conditional distributions as a stacked bar chart.

    Parameters
    ----------
    p : object
        Value shown in the figure title as ``p=<value>``.
    cd : iterable of sequences
        One sequence of `nb_quantiles` values per child class; each is drawn
        as one layer of the stack, over the parent classes on the x axis.
    nb_quantiles : int
        Number of classes on the x axis.
    """
    plt.figure()

    parent_classes = np.arange(nb_quantiles) + 1
    # Stacked bars, following this example:
    # https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
    stack_top = np.array([0] * nb_quantiles)
    for layer_number, child_distribution in enumerate(cd, start=1):
        plt.bar(
            parent_classes,
            child_distribution,
            bottom=stack_top,
            width=0.95,
            label=str(layer_number) + "e",
        )
        # The next layer starts where this one ends.
        stack_top = stack_top + np.array(child_distribution)

    # Extra room on the right of the x axis (factor 1.3).
    plt.axis([.5, nb_quantiles*1.3, 0, 1])
    plt.title("p=" + str(p))
    plt.legend()
    plt.xlabel("quantile parents")
    plt.ylabel("probabilité du quantile enfant")
    plt.show()

In [9]:
def proba_cond(c_i_parent, c_i_child, mat):
    """Return the entry of `mat` at row `c_i_child`, column `c_i_parent`.

    NOTE(review): both arguments are used as-is as positions in `mat`;
    income classes elsewhere in this notebook are numbered from 1, so
    callers presumably need to pass ``class - 1`` -- to be confirmed.
    """
    position = (c_i_child, c_i_parent)
    return mat[position]

In [10]:
# Load the pickled DataFrame "df_gini_pj_indS" from the working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load files produced by this project.
with open('df_gini_pj_indS', 'rb') as f2:
    df_gini_pj_indS = pickle.load(f2)

In [11]:
df_gini_pj_indS.head(2)

Unnamed: 0,country,country_code,Gj,income,pj,quantile
0,Albania,ALB,30.0,728.89795,0.535604,1
1,Albania,ALB,30.0,916.66235,0.535604,2


In [12]:
ds = df_gini_pj_indS.isin([np.inf, -np.inf]).sum()
ds

country         0
country_code    0
Gj              0
income          0
pj              0
quantile        0
dtype: int64

In [13]:
df_gini_pj_indS.shape

(5800000, 6)

In [14]:
df_gini_pj_indS.isnull().sum()

country         0
country_code    0
Gj              0
income          0
pj              0
quantile        0
dtype: int64

In [15]:
df_gini_pj_indS.dtypes

country          object
country_code     object
Gj              float64
income          float64
pj              float64
quantile          int64
dtype: object

In [16]:
# Rename 'quantile' to 'c_i_child', then keep only the listed columns
# (Gj is left out of this working table).
kept_columns = ['country', 'country_code', 'c_i_child', 'income', 'pj']
df_gini_pj_indSF = df_gini_pj_indS.rename(columns={'quantile': 'c_i_child'})[kept_columns]

df_gini_pj_indSF.head(3)

Unnamed: 0,country,country_code,c_i_child,income,pj
0,Albania,ALB,1,728.89795,0.535604
1,Albania,ALB,2,916.66235,0.535604
2,Albania,ALB,3,1010.916,0.535604


In [17]:
ds = df_gini_pj_indSF.isin([np.inf, -np.inf]).sum()
ds

country         0
country_code    0
c_i_child       0
income          0
pj              0
dtype: int64

In [18]:
# Downcast variables so that they take less room in memory.
# Warning: float16 is too small for income and produces inf values,
# which is why only 'pj' is converted here.
typ_var = ['pj']
for var in typ_var :
     df_gini_pj_indSF[var] = df_gini_pj_indSF[var].astype(np.float32)    

In [19]:
# Store the child income class as a 16-bit integer
# (presumably for the same memory-saving purpose as the float32 downcast).
df_gini_pj_indSF['c_i_child'] = df_gini_pj_indSF['c_i_child'].astype(np.int16)

In [20]:
# Convert the country name and country code columns to the pandas 'category' dtype.
typ_var_o = ['country', 'country_code']
for var in typ_var_o :
     df_gini_pj_indSF[var] = df_gini_pj_indSF[var].astype('category')    

In [21]:
df_gini_pj_indSF.dtypes

country         category
country_code    category
c_i_child          int16
income           float64
pj               float32
dtype: object

In [22]:
ds = df_gini_pj_indSF.isin([np.inf, -np.inf]).sum()
ds

country         0
country_code    0
c_i_child       0
income          0
pj              0
dtype: int64

In [23]:
type(df_gini_pj_indSF.country.unique().tolist())

list

In [24]:
df_gini_pj_indSF[df_gini_pj_indSF['country']=='South Africa'].value_counts()

country       country_code  c_i_child  income        pj     
South Africa  ZAF           100        82408.550000  0.65982    500
                            37         1340.842800   0.65982    500
                            27         956.198900    0.65982    500
                            28         988.318700    0.65982    500
                            29         1022.722000   0.65982    500
                                                               ... 
                            70         4057.197000   0.65982    500
                            71         4223.696000   0.65982    500
                            72         4413.778300   0.65982    500
                            73         4620.958000   0.65982    500
                            1          60.490383     0.65982    500
Length: 100, dtype: int64

In [25]:
# For every country, simulate a (child, parent) income sample from the
# country's pj, estimate the distribution of the parent class given the child
# class, and give the rows of each child class parent classes in exactly those
# proportions.
start = time.time()

nb_quantiles = 100                  # number of quantiles (number of income classes)
nb_per_class = 500                  # rows per (country, child class) in the dataframe
n = nb_per_class * nb_quantiles     # size of the simulated sample

# Work on plain arrays and positional indices: the result is written into one
# buffer and attached to the dataframe once, instead of running a boolean
# .loc assignment over the whole frame for every (country, class) pair.
child_classes = df_gini_pj_indSF["c_i_child"].to_numpy()
pj_values = df_gini_pj_indSF["pj"].to_numpy()
parent_labels = np.arange(1, nb_quantiles + 1)
c_i_parent_all = np.full(len(df_gini_pj_indSF), np.nan)

for pays in df_gini_pj_indSF["country"].unique():
    # Positions of this country's rows; pj is read from the first of them.
    country_rows = np.flatnonzero((df_gini_pj_indSF["country"] == pays).to_numpy())
    pj = pj_values[country_rows[0]]

    y_child, y_parents = generate_incomes(n, pj)
    sample = compute_quantiles(y_child, y_parents, nb_quantiles)

    # cd[i] is the distribution of the parent class for child class i + 1.
    cd = conditional_distributions(sample, nb_quantiles)

    country_children = child_classes[country_rows]
    for c_child in range(1, nb_quantiles + 1):
        # Turn probabilities into row counts. Rounding (not truncating)
        # protects against values such as 28.999999999999996.
        repeats = np.rint(cd[c_child - 1] * nb_per_class).astype(int)
        ci_parent = np.repeat(parent_labels, repeats)

        target_rows = country_rows[country_children == c_child]
        if len(ci_parent) != len(target_rows):
            raise ValueError(
                f"{pays}: {len(ci_parent)} parent classes generated for "
                f"{len(target_rows)} rows with c_i_child == {c_child}"
            )
        c_i_parent_all[target_rows] = ci_parent

# The column is kept as float, as in the outputs shown further down.
df_gini_pj_indSF["c_i_parent"] = c_i_parent_all

print((time.time()-start)/60)

# Author's timing notes, measured with the earlier implementation that used a
# boolean .loc assignment per (country, class); the value printed is in minutes:
# 0.16336220105489094 for 1 country (the original note says "seconds")
# 41 minutes for all countries
# 16 minutes for all countries with only Jupyter running
# 18 min
# 16 min with income as float64

16.556054576237997


In [26]:
# Order the rows by country, then by child income class.
df_gini_pj_indSF = df_gini_pj_indSF.sort_values(['country', 'c_i_child'])

In [27]:
df_gini_pj_indSF.shape

(5800000, 6)

In [28]:
 # Export the table to CSV with to_csv's default options.
 df_gini_pj_indSF.to_csv('df_income_classe.csv')

In [29]:
# Save "df_gini_pj_indSF", which now includes the parents' income classes.
with open('df_gini_pj_indSF', 'wb') as f5:
    pickle.dump(df_gini_pj_indSF, f5)

In [30]:
# Read "df_gini_pj_indSF" back from disk.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load files produced by this project.
with open('df_gini_pj_indSF', 'rb') as f5:
    df_gini_pj_indSF = pickle.load(f5)

10. Éventuellement et pour éviter toute confusion, effacez la variable c_i_child : nous n'en avons pas besoin pour la mission 4. 

In [31]:
df_gini_pj_indSF.head(1)

Unnamed: 0,country,country_code,c_i_child,income,pj,c_i_parent
0,Albania,ALB,1,728.89795,0.535604,1.0


In [32]:
# Remove the child income class column, as the instruction above suggests.
df_inc_pj_cip = df_gini_pj_indSF.drop(columns="c_i_child")

In [33]:
df_inc_pj_cip.head(1)

Unnamed: 0,country,country_code,income,pj,c_i_parent
0,Albania,ALB,728.89795,0.535604,1.0


In [34]:
# Probability matrix `cd` left by the last iteration of the country loop:
# rows are child income classes, columns are parent income classes.
# Labels are derived from nb_quantiles so they always match the shape of cd.
class_labels = np.arange(1, nb_quantiles + 1)
pd.DataFrame(cd, index=class_labels, columns=class_labels)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
1,0.164,0.104,0.084,0.050,0.054,0.038,0.034,0.042,0.030,0.030,...,0.000,0.002,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2,0.082,0.058,0.056,0.060,0.050,0.036,0.036,0.024,0.038,0.032,...,0.000,0.000,0.000,0.002,0.000,0.000,0.000,0.000,0.000,0.000
3,0.068,0.048,0.042,0.040,0.044,0.036,0.038,0.034,0.026,0.036,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4,0.034,0.038,0.042,0.046,0.032,0.036,0.030,0.038,0.032,0.022,...,0.002,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.002,0.000
5,0.064,0.046,0.028,0.048,0.028,0.028,0.028,0.028,0.030,0.022,...,0.000,0.000,0.000,0.002,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.000,0.000,0.000,0.002,0.000,0.002,0.000,0.000,0.000,0.000,...,0.012,0.020,0.028,0.034,0.048,0.028,0.046,0.044,0.034,0.052
97,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.002,0.000,...,0.048,0.028,0.036,0.032,0.032,0.048,0.048,0.026,0.042,0.056
98,0.000,0.000,0.000,0.000,0.002,0.000,0.000,0.002,0.000,0.000,...,0.028,0.012,0.026,0.038,0.024,0.046,0.034,0.034,0.052,0.074
99,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.030,0.022,0.046,0.038,0.036,0.038,0.052,0.044,0.072,0.094


11. Assurez-vous que votre nouvel échantillon contienne bien les variables initialement présentes dans la World Income Distribution : mj et Gj.

#### Incorporation de la variable mj (revenu moyen du pays j) dans le dataframe "df_inc_pj_cip"

In [35]:
# Load the pickled table "df_mj" from the working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load files produced by this project.
with open('df_mj', 'rb') as f3:
    df_mj = pickle.load(f3)

In [36]:
df_mj.shape

(116, 2)

In [37]:
df_mj.head(1)

Unnamed: 0,country_code,mj
0,ALB,2994.829902


In [38]:
# Left join on country_code to bring mj (mean income of country j)
# into the main dataframe.
df_pj_inc_mj_cip = df_inc_pj_cip.merge(df_mj, how='left', on='country_code')

In [39]:
df_pj_inc_mj_cip.head(1)

Unnamed: 0,country,country_code,income,pj,c_i_parent,mj
0,Albania,ALB,728.89795,0.535604,1.0,2994.829902


In [40]:
df_pj_inc_mj_cip.shape

(5800000, 6)

In [41]:
df_pj_inc_mj_cip.isnull().sum()

country         0
country_code    0
income          0
pj              0
c_i_parent      0
mj              0
dtype: int64

#### Incorporation de la variable Gj (indice de Gini du pays j) dans le dataframe "df_pj_inc_mj_cip"

In [42]:
# Load the pickled DataFrame "df_gini_pj_inc_mj" from the working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load files produced by this project.
with open('df_gini_pj_inc_mj', 'rb') as f4:
    df_gini_pj_inc_mj = pickle.load(f4)

In [43]:
df_gini_pj_inc_mj.head(1)

Unnamed: 0,country,country_code,Gj,income,pj,quantile,mj
0,Albania,ALB,30.0,728.89795,0.535604,1,2994.829902


In [44]:
# Build a two-column dataframe (country_code, Gj) to be merged back in,
# in order to recover Gj (Gini index of country j).
df_gini = df_gini_pj_inc_mj[['country_code', 'Gj']]

In [45]:
df_gini.head(1)

Unnamed: 0,country_code,Gj
0,ALB,30.0


In [46]:
# Collapse to one row per country_code, averaging Gj within each country.
df_gini = (
    df_gini
    .groupby('country_code')['Gj']
    .mean()
    .reset_index()
)
df_gini.head(2)

Unnamed: 0,country_code,Gj
0,ALB,30.0
1,ARG,45.1


In [47]:
df_gini.shape

(116, 2)

In [48]:
df_pj_inc_mj_cip.shape

(5800000, 6)

In [49]:
# Left join on country_code to bring Gj (Gini index of country j)
# into the main dataframe.
df_Gj_inc_pj_mj_cip = df_pj_inc_mj_cip.merge(df_gini, how='left', on='country_code')

In [50]:
df_Gj_inc_pj_mj_cip.head(6)

Unnamed: 0,country,country_code,income,pj,c_i_parent,mj,Gj
0,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0
1,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0
2,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0
3,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0
4,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0
5,Albania,ALB,728.89795,0.535604,1.0,2994.829902,30.0


In [54]:
ds1 = df_Gj_inc_pj_mj_cip.isin([np.inf, -np.inf]).sum()
ds1

country         0
country_code    0
income          0
pj              0
c_i_parent      0
mj              0
Gj              0
dtype: int64

In [51]:
df_Gj_inc_pj_mj_cip.isnull().sum()

country         0
country_code    0
income          0
pj              0
c_i_parent      0
mj              0
Gj              0
dtype: int64

In [52]:
# Save the final dataframe, which includes mj and Gj, as a pickle file.
with open('df_Gj_inc_pj_mj_cip', 'wb') as f6:
    pickle.dump(df_Gj_inc_pj_mj_cip, f6)

In [53]:
 # Export the final dataframe to CSV without the index column, using the ENCODAGE encoding.
 df_Gj_inc_pj_mj_cip.to_csv('df_Gj_inc_pj_mj_cip.csv', index=False, encoding=ENCODAGE)