In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the raw survey export into a DataFrame
df = pd.read_csv('Data/raw-data.csv')

# Report how many rows (observations) and columns (features) were loaded
print(f'The dataset contains {len(df)} observations and {len(df.columns)} features.')

The dataset contains 1561 observations and 309 features.


In [3]:
# Column labels
print("\nColumn Names:", df.columns, sep="\n")

# Data type of every feature
print("\nData Types of Features:", df.dtypes, sep="\n")

# Descriptive statistics for the numerical features
print("\nSummary Statistics:", df.describe(), sep="\n")

# Number of missing values per column
print("\nMissing Values (if any):", df.isna().sum(), sep="\n")


Column Names:
Index(['ID', 'RES', 'SETTYPE', 'INDWT', 'RESPSEX', 'RESPAGE', 'RATEHAP',
       'FEELTRU', 'FEELEMP', 'FEELCLS',
       ...
       'C2_R', 'C3_R', 'INTDATE', 'INTLANG', 'HHSIZE', 'HHASIZE', 'NPSUSS',
       'NHHPSU', 'NADHH', 'HHWT'],
      dtype='object', length=309)

Data Types of Features:
ID         float64
RES         object
SETTYPE     object
INDWT      float64
RESPSEX     object
            ...   
HHASIZE    float64
NPSUSS     float64
NHHPSU     float64
NADHH      float64
HHWT       float64
Length: 309, dtype: object

Summary Statistics:
                 ID        INDWT      RESPAGE          PSU       HHSIZE  \
count  1.561000e+03  1561.000000  1561.000000  1561.000000  1561.000000   
mean   9.880901e+06  1479.994234    52.803972   123.430493     3.201153   
std    5.262563e+05  1224.611751    18.034120    71.534673     1.864795   
min    9.094601e+06   247.726154    18.000000     1.000000     1.000000   
25%    9.397812e+06   656.555444    37.000000    62.000000 

In [4]:
# List of features to exclude
# NOTE(review): presumably identifiers, survey weights and sampling-design fields — confirm.
features_to_exclude = ['INDWT', 'ID', 'PSU', 'HHWT', 'NHHPSU', 'ATTEND_1']

# Drop the specified features from the dataset.
# This raises KeyError if the columns are already gone, so re-run the load cell
# before re-running this one.
df = df.drop(columns=features_to_exclude)

# Clean the target variable column.
# 1) Discard explicit non-answers; .copy() gives an independent frame so the
#    column assignment below does not write into a slice of the old one.
df = df[~df['RATEHAP'].isin(["Don't know", "Refuse to answer"])].copy()

# 2) Map the two labelled end points onto the numeric scale, then coerce the
#    whole column to numbers; any other unexpected label becomes NaN.
df['RATEHAP'] = pd.to_numeric(
    df['RATEHAP'].replace({'Extremely unhappy': 1, 'Extremely happy': 10}),
    errors='coerce',
)

# 3) Drop missing targets AFTER coercion, so values that could not be parsed
#    are removed as well instead of surviving as NaN in the target column.
df = df.dropna(subset=['RATEHAP'])

# Verify the result by checking the first few rows of the cleaned dataset
print(df.head())

        RES  SETTYPE RESPSEX  RESPAGE  RATEHAP                 FEELTRU  \
0  Response  Capital    Male     27.0      7.0  More or less describes   
1  Response  Capital    Male     33.0     10.0               Describes   
2  Response  Capital    Male     26.0      5.0               Describes   
3  Response    Rural    Male     34.0      4.0  More or less describes   
4  Response  Capital    Male     21.0      6.0               Describes   

                  FEELEMP                 FEELCLS                 FEELRLY  \
0  More or less describes  More or less describes  More or less describes   
1       Does not describe       Does not describe               Describes   
2  More or less describes               Describes               Describes   
3       Does not describe  More or less describes       Does not describe   
4       Does not describe               Describes               Describes   

             FEELREJ  ... SUBSTRATUM               C1_R        C2_R  \
0  Does not describe 

In [5]:
def replace_survey_words(df):
    """Return a copy of ``df`` with survey answer labels replaced by numeric codes.

    Every text column is recoded with the answer scales defined below; the
    input frame is never modified. Labels mapped to ``None`` (``'DK'``,
    ``'RA'``, ``'DK/RA'``) become missing values. Labels that belong to no
    scale are left as they are.

    Scale selection per column:

    * If one scale contains *all* answers observed in the column (non-answer
      labels aside), that scale is applied first. This matters for labels that
      occur in several scales with different codes: ``'At least once a month'``
      is 3 on the internet-frequency scale but 4 on the religious-frequency
      scale, and ``'Never'`` is 1 on the frequency scales but 0 on the
      interview-presence scale. Applying the scales blindly in a fixed order
      collapsed distinct categories of those columns into one code.
    * All remaining scales that share a label with the column are then applied
      in their fixed order, so columns that mix scales, or that no single
      scale covers, are still recoded with the first matching scale winning.

    A column that shows only labels shared by several scales cannot be
    disambiguated and gets the code of the earliest scale in the list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw survey answers.

    Returns
    -------
    pandas.DataFrame
        Recoded copy of ``df`` with the same columns and index.
    """
    df = df.copy()

    description_mapping = {
        'Does not describe': 1,
        'More or less describes': 2,
        'Describes': 3,
    }

    trust_mapping = {
        "You can't be too careful": 1,
        "Most people can be trusted": 5,
    }

    lifesat_mapping = {
        'Not satisfied at all': 1,
        'Completely satisfied': 5,
        'DK/RA': None,
    }

    fate_mapping = {
        'Everything in life is determined by fate': 1,
        'People shape their fate themselves': 5,
        'DK/RA': None,
    }

    health_mapping = {
        'Very poor': 1,
        'Poor': 2,
        'Fair': 3,
        'Good': 4,
        'Very good': 5,
        'DK/RA': None,
    }

    approval_mapping = {
        'Approve': 1,
        'Disapprove': 0,
        'DK/RA': None,
    }

    yesno_mapping = {
        'Yes': 1,
        'No': 0,
        'DK/RA': None,
    }

    internet_freq_mapping = {
        'Every day': 5,
        'At least once a week': 4,
        'At least once a month': 3,
        'Less often': 2,
        'Never': 1,
        "I don't know what the internet is": 0,
        'DK': None,
    }

    internet_activity_mapping = {
        'Mentioned': 1,
        'Not mentioned': 0,
        'I do not use any': 0,
        'DK/RA': None,
    }

    interest_mapping = {
        'Not at all interested': 1,
        'Hardly interested': 2,
        'Quite interested': 3,
        'Very interested': 4,
        'DK/RA': None,
    }

    politics_direction_mapping = {
        'Politics is definitely going in the wrong direction': 1,
        'Politics is going mainly in the wrong direction': 2,
        'Politics does not change at all': 3,
        'Politics is going mainly in the right direction': 4,
        'Politics is definitely going in the right direction': 5,
        'DK/RA': None,
    }

    agreement_mapping = {
        'Fully disagree': 1,
        'Rather disagree': 2,
        'Neither agree nor disagree': 3,
        'Rather agree': 4,
        'Fully agree': 5,
        'DK': None,
        'RA': None,
    }

    trust_level_mapping = {
        'Fully distrust': 1,
        'Rather distrust': 2,
        'Neither trust nor distrust': 3,
        'Rather trust': 4,
        'Fully trust': 5,
        'DK/RA': None,
    }

    critic_mapping = {
        'Criticize everyone': 1,
        'Agree with neither': 2,
        'Agree with both': 3,
        # (sic) spelling kept as is; presumably it matches the raw data — confirm.
        'Unaccaptable to criticize some': 4,
        'DK/RA': None,
    }

    courts_mapping = {
        'Courts favor some citizens': 1,
        'Agree with neither': 2,
        'Agree with both': 3,
        'Courts treat all equally': 4,
        'DK': None,
        'RA': None,
    }

    participation_mapping = {
        'People should participate': 1,
        'Agree with neither': 2,
        'Agree with both': 3,
        'People should not participate': 4,
        'DK/RA': None,
    }

    eligibility_mapping = {
        'Yes': 1,
        'No': 0,
        'I was not eligible to participate/I was not yet 18': -1,
        'DK/RA': None,
    }

    fairness_mapping = {
        'Not at all fairly': 1,
        'To some extent fairly': 2,
        'Completely fairly': 3,
        'DK/RA': None,
    }

    participation_likelihood_mapping = {
        'Certainly not participate': 1,
        'Most probably not participate': 2,
        'Most probably participate': 3,
        'Certainly participate': 4,
        'DK/RA': None,
    }

    support_mapping = {
        "Don't support at all": 1,
        'Rather not support': 2,
        "Partially support, partially don't support": 3,
        'Rather support': 4,
        'Fully support': 5,
        'DK/RA': None,
    }

    democracy_mapping = {
        'Not a democracy': 1,
        'A democracy but with major problems': 2,
        'A democracy but with minor problems': 3,
        'A full democracy': 4,
        'DK/RA': None,
    }

    democracy_preference_mapping = {
        'Democracy is preferable to any other kind of government': 1,
        'In some circumstances, a non-democratic government can be preferable': 2,
        'For someone like me, it doesn’t matter what kind of government system we have': 3,
        'DK': None,
        'RA': None,
    }

    expectation_mapping = {
        'Positive': 1,
        'Negative': 2,
        'No expectations': 3,
        'Have not heard of such events': 4,
        'DK/RA': None,
    }

    intensity_mapping = {
        'Completely': 4,
        'To some extent': 3,
        'A little bit': 2,
        'Not at all': 1,
        'DK/RA': None,
    }

    conflict_mapping = {
        'Conflicts are an inseparable part of our life': 1,
        'Agree with neither': 2,
        'Agree with both': 3,
        'Our life is going on in endless conflicts': 4,
        'DK/RA': None,
    }

    likelihood_mapping = {
        'Very unlikely': 1,
        'Rather unlikely': 2,
        'Rather likely': 3,
        'Very likely': 4,
        'DK/RA': None,
    }

    time_expectation_mapping = {
        "In less than one year's time": 1,
        'In 1-4 years': 2,
        'In 5-10 years': 3,
        'In more than 10 years': 4,
        'DK': None,
        'RA': None,
    }

    reconciliation_mapping = {
        'Sometimes it is necessary to forget what happened in the past between peoples': 1,
        'Agree with neither': 2,
        'Agree with both': 3,
        'It is impossible to forget what happened in the past between peoples': 4,
        'DK/RA': None,
    }

    planning_horizon_mapping = {
        'Up to 1 year': 1,
        'Up to 3 years': 2,
        'Up to 5 years': 3,
        'Up to 10 years': 4,
        'More than 10 years': 5,
        'I prefer to live in the present moment': 6,
        'I am not able to plan the future': 7,
        'DK/RA': None,
    }

    binary_description_mapping = {
        'Rather describes': 1,
        'Rather does not describe': 0,
        'DK/RA': None,
    }

    future_outlook_mapping = {
        'Mainly for the worse': 1,
        'Will remain the same': 2,
        'Mainly for the better': 3,
        'DK': None,
        'RA': None,
    }

    wish_mapping = {
        "Don't wish at all": 1,
        "Rather don't wish": 2,
        'Neither wish, nor don’t wish': 3,
        'Rather wish': 4,
        'Wish very much': 5,
        'DK/RA': None,
    }

    expectation_strength_mapping = {
        'No expectation at all': 1,
        'Rather not expect': 2,
        'Neither expect, nor do not': 3,
        'Rather expect': 4,
        'Expect very much': 5,
        'DK/RA': None,
    }

    strong_agreement_mapping = {
        'Strongly disagree': 1,
        'Disagree': 2,
        'Neither agree, nor disagree': 3,
        'Agree': 4,
        'Strongly agree': 5,
        'DK/RA': None,
    }

    work_experience_mapping = {
        'Less than 5 years': 1,
        '6-10': 2,
        '11-15': 3,
        '16-20': 4,
        'Longer than 20 years': 5,
        'Never had a job': 0,
        'DK': None,
        'RA': None,
    }

    income_mapping = {
        '0': 0,
        'Up to AMD 48,737': 1,
        'AMD 48,738 – 66,842': 2,
        'AMD 66,843 – 92,534': 3,
        'AMD 92,535 – 203,564': 4,
        'More than AMD 203,564': 5,
        'DK': None,
        'RA': None,
    }

    education_mapping = {
        'Secondary or lower': 1,
        'Secondary technical': 2,
        'Higher than secondary': 3,
        'DK/RA': None,
    }

    skill_level_mapping = {
        'No basic knowledge': 1,
        'Beginner': 2,
        'Intermediate': 3,
        'Advanced': 4,
        'DK/RA': None,
    }

    religious_frequency_mapping = {
        'Once a week or more often': 5,
        'At least once a month': 4,
        'Only on special religious holidays': 3,
        'Less often': 2,
        'Never': 1,
        'DK/RA': None,
    }

    fasting_mapping = {
        'Always fast': 5,
        'Often fast': 4,
        'Sometimes fast': 3,
        'Rarely fast': 2,
        'Never fast': 1,
        'Fasting is not required in my religion': 0,
        'DK/RA': None,
    }

    importance_mapping = {
        'Not at all important': 1,
        'Not very important': 2,
        'Rather important': 3,
        'Very important': 4,
        'DK/RA': None,
    }

    general_frequency_mapping = {
        'Every day': 5,
        'Every week': 4,
        'Every month': 3,
        'Less often': 2,
        'Never': 1,
        'DK/RA': None,
    }

    count_mapping = {
        'Zero': 0,
        'One': 1,
        'Two': 2,
        'Three and more': 3,
    }

    activity_frequency_mapping = {
        'Each week': 5,
        'Each month': 4,
        'Every other month': 3,
        'Less frequently': 2,
        'Never': 1,
        'DK/RA': None,
    }

    ranking_mapping = {
        'Lowest': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        'Highest': 5,
        'DK': None,
        'RA': None,
    }

    rating_mapping = {
        'Very poor': 1,
        'Poor': 2,
        'Fair': 3,
        'Good': 4,
        'Very good': 5,
        'DK': None,
        'RA': None,
    }

    interference_mapping = {
        'Yes, during the entire interview': 4,
        'Yes, most of the time': 3,
        'Yes, for less than half of the interview': 2,
        'Yes, for only a few questions': 1,
        'No': 0,
    }

    interview_presence_mapping = {
        'Never': 0,
        'Less than half of the interview': 1,
        'Almost half the interview': 2,
        'More than half of the interview': 3,
        'Through the entire interview': 4,
    }

    intelligence_mapping = {
        'Not at all intelligent': 1,
        'Not very intelligent': 2,
        'Average': 3,
        'Intelligent': 4,
        'Very intelligent': 5,
    }

    sincerity_mapping = {
        'Not at all sincere': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        'Completely sincere': 5,
    }

    fatigue_mapping = {
        'Very tired': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        'Not tired at all': 5,
    }

    count_topcoded_mapping = {
        'Zero': 0,
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four or more': 4,
    }

    count_extended_mapping = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
        'Six or more': 6,
    }

    # Fallback order. When no single scale covers a column, a label shared by
    # several scales takes the code of the scale listed first here.
    scales = (
        description_mapping,
        trust_mapping,
        lifesat_mapping,
        fate_mapping,
        health_mapping,
        approval_mapping,
        yesno_mapping,
        internet_freq_mapping,
        internet_activity_mapping,
        interest_mapping,
        politics_direction_mapping,
        agreement_mapping,
        trust_level_mapping,
        critic_mapping,
        courts_mapping,
        participation_mapping,
        eligibility_mapping,
        fairness_mapping,
        participation_likelihood_mapping,
        support_mapping,
        democracy_mapping,
        democracy_preference_mapping,
        expectation_mapping,
        intensity_mapping,
        conflict_mapping,
        likelihood_mapping,
        time_expectation_mapping,
        reconciliation_mapping,
        planning_horizon_mapping,
        binary_description_mapping,
        future_outlook_mapping,
        wish_mapping,
        expectation_strength_mapping,
        strong_agreement_mapping,
        work_experience_mapping,
        income_mapping,
        education_mapping,
        skill_level_mapping,
        religious_frequency_mapping,
        fasting_mapping,
        importance_mapping,
        general_frequency_mapping,
        count_mapping,
        activity_frequency_mapping,
        ranking_mapping,
        rating_mapping,
        interference_mapping,
        interview_presence_mapping,
        intelligence_mapping,
        sincerity_mapping,
        fatigue_mapping,
        count_topcoded_mapping,
        count_extended_mapping,
    )

    # Non-answer labels are ignored when working out which scale a column
    # uses; this set does not control what gets replaced.
    non_answers = {'DK', 'RA', 'DK/RA', "Don't know", 'Refuse to answer'}

    for col in df.columns:
        column = df[col]
        holds_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not holds_text:
            continue

        # Work on an object-typed view so integer codes can be written even
        # when the column uses a dedicated string dtype.
        column = column.astype(object)
        observed = {value for value in column.unique() if isinstance(value, str)}
        answers = observed - non_answers

        # The first scale that contains every observed answer is taken to be
        # the scale this column was recorded on.
        preferred = None
        if answers:
            preferred = next(
                (scale for scale in scales if answers <= scale.keys()), None
            )

        plan = [] if preferred is None else [preferred]
        # Scales without any label in common with the column cannot change it,
        # so they are skipped instead of being run as no-op passes.
        plan.extend(
            scale
            for scale in scales
            if scale is not preferred and not observed.isdisjoint(scale)
        )

        for scale in plan:
            column = column.replace(scale)

        # Settle on a numeric dtype whenever the recoded values allow it.
        df[col] = column.infer_objects()

    return df

In [6]:
# Turn every "no answer" label into a missing value, in all columns at once
df = df.replace(dict.fromkeys(["Don't know", "DK", "RA", "DK/RA"], np.nan))

In [8]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): replace_survey_words is defined above but is not called in any
# visible cell (execution counts jump from 6 to 8) — confirm whether the
# recoding is meant to be applied before this export.
df.to_csv(path_or_buf='clean_data.csv', index=False)