In [2]:
import pandas as pd
import json

In [3]:
# Function to preprocess data
def preprocess_data(df):
    """Fill missing values in every object-dtype column with "Missing".

    Columns of any other dtype are left untouched. The filled columns are
    assigned back onto ``df``, so the frame passed in is modified, and that
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns may contain missing values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing entries in its object-dtype
        columns replaced by the string ``"Missing"``.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # Assign the filled column back rather than calling
            # fillna(..., inplace=True) on df[col]: that chained in-place form
            # acts on an intermediate Series, is deprecated in pandas 2.x and
            # does not update df when Copy-on-Write is enabled.
            df[col] = df[col].fillna("Missing")
    return df

# Function to create mappings for string columns
def create_mappings(df):
    """Build a value-to-integer lookup for each object-dtype column of ``df``.

    For every column whose dtype is ``object``, the distinct values are
    numbered 0, 1, 2, ... in the order ``Series.unique()`` returns them.
    Columns of other dtypes are skipped. ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to derive the lookups from.

    Returns
    -------
    dict
        ``{column name: {distinct value: integer code}}`` with one entry per
        object-dtype column.
    """
    def _number_values(series):
        # Pair each distinct value with its position in the unique() result.
        distinct = series.unique()
        return dict(zip(distinct, range(len(distinct))))

    return {
        name: _number_values(df[name])
        for name in df.columns
        if df[name].dtype == 'object'
    }

# Function to apply mappings to the dataframe
def apply_mappings(df, mappings):
    """Replace the values of the mapped columns with their lookup results.

    Each column named in ``mappings`` is passed through ``Series.map`` with
    that column's dict and the result is assigned back onto ``df``. The frame
    passed in is therefore modified, and the same object is returned. Values
    that are not keys of the column's dict come back from ``Series.map`` as
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``mappings``.
    mappings : dict
        ``{column name: {original value: replacement}}``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the mapped columns replaced.
    """
    for column_name, lookup in mappings.items():
        encoded = df[column_name].map(lookup)
        df[column_name] = encoded
    return df

In [4]:
# Read the input CSV (path is relative to the working directory) into `df`.
file_path = './pre_processed_data.csv' 
df = pd.read_csv(file_path)

In [5]:
# Fill missing values in the object-dtype columns with the "Missing" placeholder.
# preprocess_data returns the frame it was given, so `preprocessed_df` is the
# same object as `df`, not a copy.
preprocessed_df = preprocess_data(df)

In [6]:
# Build one {original value -> integer code} dict per object-dtype column.
all_mappings = create_mappings(preprocessed_df)


In [7]:
# Replace the values of every mapped column with their integer codes.
# apply_mappings assigns into the frame it receives and returns it, so
# `numeric_df` and `preprocessed_df` are the same object after this cell.
# NOTE(review): re-running this cell on its own maps the already-encoded
# integers, which are not keys of the mappings, so Series.map would turn them
# into NaN; re-run from the load cell instead.
numeric_df = apply_mappings(preprocessed_df, all_mappings)

In [8]:
# Save the encoded frame as CSV; index=False leaves the row index out of the file.
output_csv_path = './numeric_data.csv' 
numeric_df.to_csv(output_csv_path, index=False)

In [9]:
# Write the mappings dict to a JSON file, pretty-printed with a 4-space indent.
output_json_path = './mappings.json'
with open(output_json_path, 'w') as json_file:
    json.dump(all_mappings, json_file, indent=4)

In [10]:
# Show the per-column value -> code mappings as the cell output.
all_mappings

{'Device_type': {'Smartphone': 0, 'Laptop/PC': 1, 'Tablet': 2},
 'Language': {'English': 0, 'Spanish': 1},
 'Smartphone_user': {'Yes, I have a smartphone': 0,
  'No, I do not have a smartphone': 1,
  'Refused': 2},
 'Social_media_user': {'Yes, I use social media sites': 0,
  'No, I do not use social media sites': 1,
  'Refused': 2,
  'Missing': 3},
 'Videogame_player': {'No, I do not play video games': 0,
  'Yes, I play video games': 1,
  'Refused': 2},
 'TS_on_smartphone': {'Too much time': 0,
  'About the right amount of time': 1,
  'Missing': 2,
  'Too little time': 3,
  'Refused': 4},
 'TS_on_socialmedia': {'About the right amount of time': 0,
  'Too little time': 1,
  'Too much time': 2,
  'Missing': 3,
  'Refused': 4},
 'TS_on_videogames': {'Missing': 0,
  'Too little time': 1,
  'About the right amount of time': 2,
  'Too much time': 3,
  'Refused': 4},
 'Kids_at_home': {'Yes': 0, 'No': 1, 'Refused': 2, 'Missing': 3},
 'How_is_curret_days_parenting': {'Harder than it was 20 year

In [11]:
# Show the encoded frame as the cell output.
numeric_df

Unnamed: 0,Device_type,Language,Smartphone_user,Social_media_user,Videogame_player,TS_on_smartphone,TS_on_socialmedia,TS_on_videogames,No_of_kids_0_4,No_of_kids_5_11,...,P_age_categories,P_gender,P_education_level,P_race,P_nationality,P_marital_status,P_supporting_party,P_income,P_employment_type,P_neighborhood
0,0,0,0,0,0,0,0,0,2.0,1.0,...,0,0,0,0,0,0,0,250000,0,0
1,1,0,0,0,1,0,1,1,0.0,1.0,...,0,0,0,0,0,0,0,90000,1,1
2,0,0,0,0,1,0,2,2,1.0,2.0,...,0,0,1,0,0,0,1,70000,0,1
3,1,0,0,1,1,1,3,2,1.0,2.0,...,0,1,2,1,0,0,2,250000,2,1
4,0,0,0,0,1,1,0,2,0.0,2.0,...,0,1,3,0,0,0,3,250000,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,0,1,0,0,1,0,2,1,1.0,0.0,...,4,0,0,3,2,0,1,0,2,1
3636,0,0,0,1,1,1,3,2,0.0,0.0,...,0,0,3,3,0,4,3,175000,2,1
3637,0,1,0,0,0,3,1,0,0.0,1.0,...,2,1,2,3,1,0,3,70000,2,1
3638,2,0,0,3,1,1,3,2,2.0,0.0,...,0,0,2,1,0,0,1,30000,0,2
