# Preprocessing

In [2]:
import pandas as pd
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the raw Telco customer-churn export into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

In [4]:
# Rows whose TotalCharges is a single space have no usable amount; treat
# them as zero. The replacement is the string "0" rather than the int 0 so
# the column stays uniformly textual until it is parsed as numeric: mixing an
# int into a text column leaves it with mixed types, and strict string dtypes
# refuse non-string assignments outright.
df.loc[df["TotalCharges"] == " ", "TotalCharges"] = "0"

In [5]:
# Convert the TotalCharges column to a numeric dtype.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric)

In [6]:
# Continuous features that are kept as numbers.
numerical = ["tenure", "MonthlyCharges", "TotalCharges"]
columns = list(df.columns)
# Every remaining column except the row identifier is treated as categorical.
# The list is built by filtering `columns` in order instead of using a set
# difference: iteration order of a set of strings changes with hash
# randomization, which made the column order of the exported CSVs differ
# from one run to the next.
excluded = {*numerical, "customerID"}
categorical = [name for name in columns if name not in excluded]

In [7]:
# One lookup table per categorical column, in the same order as
# `categorical`: each distinct value of the column is paired with a
# consecutive integer code starting at 0, following the order returned by
# `unique()`.
mappings = [
    {value: code for code, value in enumerate(df[name].unique())}
    for name in categorical
]

In [8]:
# Assemble the encoded frame: the identifier and the numeric columns are
# copied over unchanged, then each categorical column is appended with its
# values replaced by the integer codes from the matching lookup table.
enum_df = df[["customerID", *numerical]].copy()
for name, codes in zip(categorical, mappings):
    enum_df[name] = df[name].map(codes)

In [9]:
# Persist the integer-encoded (not yet rescaled) dataset, without the index.
enum_df.to_csv(
    path_or_buf="../data/processed/churn_enum.csv",
    index=False,
)

In [10]:
# Min-max scale the numeric features and write them back into `enum_df`.
# `fit_transform` returns a plain 2-D array whose columns follow the order of
# `numerical`, so each scaled column is assigned back by name using that same
# list; this keeps names and positions in sync if `numerical` is ever
# reordered or extended.
scaled_values = MinMaxScaler().fit_transform(enum_df[numerical])
for position, name in enumerate(numerical):
    enum_df[name] = scaled_values[:, position]

In [11]:
# Persist the encoded dataset after the numeric columns have been rescaled.
enum_df.to_csv(
    path_or_buf="../data/processed/churn_enum_normalized.csv",
    index=False,
)

In [12]:
# Persist the cleaned (but not encoded) dataset, without the index.
df.to_csv(
    path_or_buf="../data/processed/churn.csv",
    index=False,
)