# Data Preprocessing and Analysis

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Sample data: six people, two of whom have no recorded age (NaN).
names = ["Carol", "Kate", "Jane", "Kuda", "Tito", "Kuku"]
age = [23, np.nan, 34, 56, np.nan, 44]

# Assemble the two lists into a frame with columns "Names" and "Age".
df = pd.DataFrame(dict(Names=names, Age=age))
df.head()

Unnamed: 0,Names,Age
0,Carol,23.0
1,Kate,
2,Jane,34.0
3,Kuda,56.0
4,Tito,


In [3]:
# Number of missing values in each column

df.isna().sum()

Names    0
Age      2
dtype: int64

In [4]:
# Take an independent (deep) copy of df so later steps can work on it
# without touching the original frame.
df_copy = df.copy(deep=True)
df_copy

Unnamed: 0,Names,Age
0,Carol,23.0
1,Kate,
2,Jane,34.0
3,Kuda,56.0
4,Tito,
5,Kuku,44.0


In [5]:
# Drop every row that contains a missing value, then renumber the remaining
# rows from 0. `drop=True` discards the old index instead of keeping it as a
# column. The result is only displayed here; df_copy itself is not modified.

df_copy.dropna().reset_index(drop=True)

Unnamed: 0,Names,Age
0,Carol,23.0
1,Jane,34.0
2,Kuda,56.0
3,Kuku,44.0


In [6]:
# Fill in missing ages with the mean of the observed ages

# Impute only the "Age" column: a frame-wide fillna would also write the age
# mean into any other column that happens to contain a gap (e.g. "Names").
# `assign` builds a new frame, so df is left untouched and this cell can be
# re-run safely.
age_mean = df["Age"].mean()  # Series.mean skips NaN by default
df_copy_2 = df.assign(Age=df["Age"].fillna(age_mean))
df_copy_2

Unnamed: 0,Names,Age
0,Carol,23.0
1,Kate,39.25
2,Jane,34.0
3,Kuda,56.0
4,Tito,39.25
5,Kuku,44.0


In [7]:
# One gender label per row, listed in the frame's row order.
gender_values = ["F", "F", "F", "M", "M", "M"]

# Attach the labels to the frame as a "Gender" column
df_copy_2 = df_copy_2.assign(Gender=gender_values)
df_copy_2

Unnamed: 0,Names,Age,Gender
0,Carol,23.0,F
1,Kate,39.25,F
2,Jane,34.0,F
3,Kuda,56.0,M
4,Tito,39.25,M
5,Kuku,44.0,M


In [8]:
# Reshape from wide to long: "Names" stays as the identifier column, every
# other column is unpivoted so that its header lands in "Age_Gender" and its
# cell content in "values".
df_melted = pd.melt(
    df_copy_2,
    id_vars=["Names"],
    var_name="Age_Gender",
    value_name="values",
)
df_melted

Unnamed: 0,Names,Age_Gender,values
0,Carol,Age,23.0
1,Kate,Age,39.25
2,Jane,Age,34.0
3,Kuda,Age,56.0
4,Tito,Age,39.25
5,Kuku,Age,44.0
6,Carol,Gender,F
7,Kate,Gender,F
8,Jane,Gender,F
9,Kuda,Gender,M


In [9]:
# Data frame of names and gender

# Select the "Gender" rows directly with a boolean mask. Masking with
# `.where(...)` and then calling `.dropna()` would also discard any matching
# row that holds a genuine missing value, and it converts the non-matching
# rows to NaN first. `.copy()` gives an independent frame, so later edits to
# it do not depend on df_melted. The original row labels are kept.
is_gender_row = df_melted["Age_Gender"] == "Gender"
names_gender_df = df_melted.loc[is_gender_row].copy()
names_gender_df

Unnamed: 0,Names,Age_Gender,values
6,Carol,Gender,F
7,Kate,Gender,F
8,Jane,Gender,F
9,Kuda,Gender,M
10,Tito,Gender,M
11,Kuku,Gender,M


In [10]:
# Upper-case every column header
names_gender_df = names_gender_df.rename(columns=str.upper)
names_gender_df

Unnamed: 0,NAMES,AGE_GENDER,VALUES
6,Carol,Gender,F
7,Kate,Gender,F
8,Jane,Gender,F
9,Kuda,Gender,M
10,Tito,Gender,M
11,Kuku,Gender,M
