In [None]:
#imports
import pandas as pd
import numpy as np
import polars as pl
from scipy.stats import pearsonr 
import random as rd

In [None]:
# Load the survey extract into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype columns caused by chunk-wise type inference.
# NOTE(review): judging by the file name this is the ESS round 9 data file -- confirm.
ess = pd.read_csv('ESS9e03_1.csv', low_memory = False)

In [None]:
# Preview the first 20 rows of the raw data.
ess.head(20)

In [None]:
# Restrict the survey to the rows whose country code is 'FR'.
france = ess.loc[ess['cntry'].eq('FR')]
france

In [None]:
# Keep the respondent id plus the six survey items used in this notebook.
survey_columns = ['idno', 'polintr', 'trstprt', 'stflife', 'stfeco', 'sclmeet', 'agea']

# Non-response codes (refusal / don't know / no answer) to drop, per column.
# polintr is a 1-4 item, so its non-response codes are the one-digit 7/8/9;
# filtering it only on 77/88/99 removed nothing and left those rows in as if
# they were valid answers. 77/88/99 stay listed for polintr so that no row the
# previous filter dropped is kept now.
# sclmeet is a 1-7 item where 7 is a valid answer, hence the two-digit codes.
# NOTE(review): code lists follow the ESS codebook -- confirm against the
# codebook of the exact data edition in use.
missing_codes = {
    'polintr': [7, 8, 9, 77, 88, 99],
    'trstprt': [77, 88, 99],
    'stflife': [77, 88, 99],
    'stfeco': [77, 88, 99],
    'sclmeet': [77, 88, 99],
    'agea': [999],
}

fr_ess = france[survey_columns]
for column, codes in missing_codes.items():
    fr_ess = fr_ess[~fr_ess[column].isin(codes)]

# Detach the result from `france` so it is an independent frame.
fr_ess = fr_ess.copy()
fr_ess

In [None]:
# First demo subset: rows 0-9 of the cleaned French data, all columns.
fr_ess_subset_1 = fr_ess.iloc[:10, :]
fr_ess_subset_1

In [None]:
# Second demo subset: rows 10-19, ordered by age (ascending) so that a
# position-based mask can target the oldest respondents.
fr_ess_subset_2 = fr_ess.iloc[10:20, :].sort_values('agea')
fr_ess_subset_2

In [None]:
# Third demo subset: rows 20-29 of the cleaned French data, all columns.
fr_ess_subset_3 = fr_ess.iloc[20:30, :]
fr_ess_subset_3

In [None]:
# Fourth demo subset: rows 30-39 of the cleaned French data, all columns.
fr_ess_subset_4 = fr_ess.iloc[30:40, :]
fr_ess_subset_4

### Introducing MCAR (missing completely at random) values into the data

In [None]:
# Reproducible coin flip per row: True marks a cell that will be blanked.
np.random.seed(110)
n_rows, n_cols = fr_ess_subset_1.shape
mask = np.random.choice([True, False], size=(n_rows, 1))
# Every column except the last one stays fully observed.
matrix = np.zeros((n_rows, n_cols - 1), dtype=bool)

# Full-size indicator matrix: the random flags go into the last column.
matrix_new = np.hstack([matrix, mask])
matrix_new

In [None]:
# Replace every cell flagged True in matrix_new with NaN; other cells are kept.
# The former `mask[mask.all(1),-1] = 0` statement was removed: matrix_new is a
# copy produced by np.concatenate, so editing `mask` afterwards never changed
# the result -- it only reset `mask` itself to all False.
fr_ess_subset_1_MCAR = fr_ess_subset_1.mask(matrix_new)
fr_ess_subset_1_MCAR

In [None]:
# Pairwise column correlations of the MCAR demo subset
# (DataFrame.corr excludes NaN cells pair by pair).
fr_ess_subset_1_MCAR.corr()

### Introducing MAR (missing at random) values into the data

In [None]:
# Fixed (non-random) column vector of 10 flags: the last three rows are marked
# as missing, the first seven as observed.
np.random.seed(110)
mask = np.zeros((10, 1), dtype=bool)
mask[7:] = True
mask

In [None]:
# All-False block covering every column of the subset except one.
n_rows, n_cols = fr_ess_subset_2.shape
matrix = np.zeros((n_rows, n_cols - 1), dtype=bool)

# Slot the flag vector in as column position 1; everything else stays False.
matrix_new = np.hstack([matrix[:, :1], mask, matrix[:, 1:]])
matrix_new

In [None]:
# Replace every cell flagged True in matrix_new with NaN; other cells are kept.
# The former `mask[mask.all(1),-1] = 0` statement was removed: matrix_new is a
# copy produced by np.concatenate, so editing `mask` afterwards never changed
# the result -- it only reset `mask` itself to all False.
fr_ess_subset_2_MAR = fr_ess_subset_2.mask(matrix_new)
fr_ess_subset_2_MAR

### Introducing MNAR (missing not at random) values into the data

In [None]:
# Fixed (non-random) column vector of 10 flags: the last five rows are marked
# as missing, the first five as observed.
np.random.seed(120)
mask = np.zeros((10, 1), dtype=bool)
mask[5:] = True
mask

In [None]:
# All-False block for every column except the last one.
# Sized from fr_ess_subset_2 because that is the frame this indicator matrix is
# applied to in the MNAR cell below (it used to be sized from fr_ess_subset_1,
# which only worked because both subsets happen to have the same shape).
matrix = np.full((len(fr_ess_subset_2), fr_ess_subset_2.shape[1] - 1), False)

# Append the flag vector as the last column of the indicator matrix.
matrix_new = np.concatenate((matrix, mask), axis=1)
matrix_new


In [None]:
# Replace every cell flagged True in matrix_new with NaN; other cells are kept.
# When the cells above were run in order, the flags sit in the last column for
# the last five rows of the age-sorted subset, i.e. the missingness depends on
# the very values that are removed.
fr_ess_subset_2_MNAR = fr_ess_subset_2.mask(matrix_new)
fr_ess_subset_2_MNAR

### Generating the matrices and building the dataset

In [None]:
########### MCAR ###############
# Build 1000 labelled MCAR examples. Each example is a 10-row window of fr_ess
# in which one randomly chosen non-id column loses cells by a fair coin flip
# per row, independent of any data value.
rd.seed(110)         # `random` picks the windows and the column
np.random.seed(110)  # numpy draws the per-row coin flips
window = 10
n_columns = fr_ess.shape[1]
# Cap the latest start so that every window really holds `window` rows, even if
# fr_ess has fewer than 1910 rows (rd.randint raises if no valid start exists).
max_start = min(1900, len(fr_ess) - window)

empty_list = []  # name kept: the next cell builds result_1 from it
random_list = [rd.randint(1, max_start) for _ in range(1000)]  # window start positions
for n in random_list:
    item = fr_ess.iloc[n:n + window, :]
    mask = np.random.choice([True, False], size=(len(item), 1))
    matrix = np.full((len(item), n_columns - 1), False)

    # rd.randint is inclusive on both ends. With the previous upper bound of 7,
    # both 6 and 7 put the mask into the last column, so that column was picked
    # twice as often as any other. Position 0 (the id column) is never masked.
    number = rd.randint(1, n_columns - 1)
    matrix_new = np.concatenate((matrix[:, :number], mask, matrix[:, number:]), axis=1)

    # DataFrame.mask returns a new frame, so fr_ess itself is never modified.
    MCAR_matrix = item.mask(matrix_new)
    empty_list.append([MCAR_matrix.to_numpy(), matrix_new])
    


In [None]:
# One row per MCAR example: 'Yobs' holds the data matrix (with NaN where values
# were removed) and 'Ymis' holds the matching boolean missingness indicator.
result_1 = pd.DataFrame(empty_list, columns = ['Yobs', 'Ymis'])
result_1

In [None]:
# Label every MCAR example with the missing-data-mechanism code 0.
result_1['MDM'] = 0 #MCAR = 0
result_1

In [None]:
########### MAR ###############
# Build 1000 labelled MAR examples. Each example is a 10-row window of fr_ess in
# which one column ("number") loses cells with a probability that depends on the
# observed values of a *different* column ("m").
rd.seed(120)  # `random` drives every draw in this cell; seeding makes re-runs repeatable
window = 10
n_columns = fr_ess.shape[1]
# Cap the latest start so that every window really holds `window` rows, even if
# fr_ess has fewer than 1910 rows (rd.randint raises if no valid start exists).
max_start = min(1900, len(fr_ess) - window)
probability_missing = [0.8, 0.9, 1.0]  # candidate missingness probabilities for at-risk rows

empty_list = []  # name kept: the next cell builds result_2 from it
random_list = [rd.randint(1, max_start) for _ in range(1000)]  # window start positions
# enumerate supplies the counter; it used to stay at 0 forever, so the "<"
# direction below was never exercised.
for counter, n in enumerate(random_list):
    item = fr_ess.iloc[n:n + window, :]

    # Column whose values drive the missingness.
    # NOTE(review): position 0 is the id column and can be drawn here -- confirm
    # that an id-driven mechanism is intended.
    m = rd.randint(0, n_columns - 1)
    driver = item.iloc[:, m]

    # Candidate thresholds: the distinct values in the window, without the two
    # extremes when there are more than four distinct values.
    lst = sorted(driver.unique())
    lst_for_threshold = lst[1:-1] if len(lst) > 4 else lst
    threshold = rd.choice(lst_for_threshold)
    probability = rd.choice(probability_missing)  # one probability per window

    # Alternate the direction of the rule between consecutive examples.
    if counter % 2 == 0:
        at_risk = (driver > threshold).to_numpy()
    else:
        at_risk = (driver < threshold).to_numpy()

    # Rows on the at-risk side go missing with `probability`; all others never do.
    is_missing = np.array(
        [bool(flag) and rd.random() < probability for flag in at_risk],
        dtype=bool,
    )

    # Column that loses values: any non-id column other than the driver.
    number = rd.choice([c for c in range(1, n_columns) if c != m])

    matrix_new = np.zeros((len(item), n_columns), dtype=bool)
    matrix_new[:, number] = is_missing

    # DataFrame.mask returns a new frame: no helper columns are added to a slice
    # of fr_ess and no NaN is written into an integer column in place.
    MAR_matrix = item.mask(matrix_new)

    # dtype=float keeps the stored array float64 even when no cell went missing.
    empty_list.append([MAR_matrix.to_numpy(dtype=float), matrix_new])
    


In [None]:
# One row per MAR example: 'Yobs' holds the data matrix (with NaN where values
# were removed) and 'Ymis' holds the matching boolean missingness indicator.
result_2 = pd.DataFrame(empty_list, columns = ['Yobs', 'Ymis'])
result_2['MDM'] = 1  #MAR = 1
result_2

In [None]:
########### MNAR ###############
# Build 1000 labelled MNAR examples. Each example is a 10-row window of fr_ess
# in which one non-id column loses cells with a probability that depends on that
# column's own values.
rd.seed(130)  # `random` drives every draw in this cell; seeding makes re-runs repeatable
window = 10
n_columns = fr_ess.shape[1]
# Cap the latest start so that every window really holds `window` rows, even if
# fr_ess has fewer than 1910 rows (rd.randint raises if no valid start exists).
max_start = min(1900, len(fr_ess) - window)
probability_missing = [0.8, 0.9, 1.0]  # candidate missingness probabilities for at-risk rows

empty_list = []  # name kept: the next cell builds result_3 from it
random_list = [rd.randint(1, max_start) for _ in range(1000)]  # window start positions
# enumerate supplies the counter; it used to stay at 0 forever, so the "<"
# direction below was never exercised.
for counter, n in enumerate(random_list):
    item = fr_ess.iloc[n:n + window, :]

    # The same column both drives the missingness and loses the values.
    m = rd.randint(1, n_columns - 1)
    number = m
    driver = item.iloc[:, m]

    # Candidate thresholds: the distinct values in the window, without the two
    # extremes when there are more than four distinct values.
    lst = sorted(driver.unique())
    lst_for_threshold = lst[1:-1] if len(lst) > 4 else lst
    threshold = rd.choice(lst_for_threshold)
    probability = rd.choice(probability_missing)  # one probability per window

    # Alternate the direction of the rule between consecutive examples.
    if counter % 2 == 0:
        at_risk = (driver > threshold).to_numpy()
    else:
        at_risk = (driver < threshold).to_numpy()

    # Rows on the at-risk side go missing with `probability`; all others never do.
    is_missing = np.array(
        [bool(flag) and rd.random() < probability for flag in at_risk],
        dtype=bool,
    )

    matrix_new = np.zeros((len(item), n_columns), dtype=bool)
    matrix_new[:, number] = is_missing

    # DataFrame.mask returns a new frame: no helper columns are added to a slice
    # of fr_ess and no NaN is written into an integer column in place.
    MNAR_matrix = item.mask(matrix_new)

    # dtype=float keeps the stored array float64 even when no cell went missing.
    empty_list.append([MNAR_matrix.to_numpy(dtype=float), matrix_new])
    


In [None]:
# One row per MNAR example: 'Yobs' holds the data matrix (with NaN where values
# were removed) and 'Ymis' holds the matching boolean missingness indicator.
result_3 = pd.DataFrame(empty_list, columns = ['Yobs', 'Ymis'])
result_3['MDM'] = 2  #MNAR = 2
result_3

### Final dataframe with three missing data mechanisms

In [None]:
# Stack the MCAR (0), MAR (1) and MNAR (2) example sets into one frame.
# NOTE(review): reset_index() without drop=True keeps each set's old row labels
# as an extra 'index' column -- confirm that column is wanted downstream.
cnct_df = pd.concat([result_1, result_2, result_3]).reset_index()
cnct_df