In [None]:
# import libraries
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import warnings

# Location of the processed telecom offer CSV (a gs:// object path)
DATA_PATH = 'gs://divg-groovyhoon-pr-d2eab4-default/projectpro/processed_telecom_offer_data.csv'

# Load the CSV into a dataframe
df = pd.read_csv(DATA_PATH)

# Preview the first rows
df.head()

In [None]:
# Split the dataframe into a training dataset and a serving dataset
def training_serving_split(df):
    """Partition ``df`` by the value of its ``offer`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``offer`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(training, serving)`` where ``serving`` holds the rows whose
        ``offer`` equals ``'No Offer'`` and ``training`` holds all other rows.
    """
    # Compute the condition once and reuse it (negated) for the other half.
    no_offer_mask = df['offer'] == 'No Offer'
    return df[~no_offer_mask], df[no_offer_mask]

df_training, df_serving = training_serving_split(df)

# Sanity check: report the shape of each partition
print(*(frame.shape for frame in (df_training, df_serving)))

In [None]:

# Columns that represent the customer ID and the business outcomes
id_variables = [
    'Customer ID', 'Month', 'Month of Joining',
    'offer',
    'Churn Category', 'Churn Reason', 'Customer Status', 'Churn Value',
]

# Columns that represent different profiles of customers
profile_variables = [
    # record keys
    'Customer ID', 'Month', 'Month of Joining',
    # personal attributes and area code
    'Gender', 'Age', 'Married', 'Number of Dependents', 'area_codes',
    # roam_* columns
    'roam_ic', 'roam_og',
    # *_og* columns
    'loc_og_t2t', 'loc_og_t2m', 'loc_og_t2f', 'loc_og_t2c',
    'std_og_t2t', 'std_og_t2m', 'std_og_t2f', 'std_og_t2c',
    'isd_og', 'spl_og', 'og_others',
    # *_ic* columns
    'loc_ic_t2t', 'loc_ic_t2m', 'loc_ic_t2f',
    'std_ic_t2t', 'std_ic_t2m', 'std_ic_t2f', 'std_ic_t2o',
    'spl_ic', 'isd_ic', 'ic_others',
    # recharge, data volume and arpu columns
    'total_rech_amt', 'total_rech_data', 'vol_4g', 'vol_5g',
    'arpu_5g', 'arpu_4g', 'arpu', 'aug_vbc_5g',
    # referrals and service columns
    'Number of Referrals', 'Phone Service', 'Multiple Lines',
    'Internet Service', 'Internet Type', 'Streaming Data Consumption',
    'Online Security', 'Online Backup', 'Device Protection Plan',
    'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
    'Streaming Music', 'Unlimited Data',
    # payment
    'Payment Method',
]

# Take explicit copies of each column subset. The feature frames get a new
# column assigned in a later cell; without .copy() that assignment targets a
# slice of a slice of `df` and pandas emits SettingWithCopyWarning.
df_training_id = df_training[id_variables].copy()
df_training_feat = df_training[profile_variables].copy()

df_serving_id = df_serving[id_variables].copy()
df_serving_feat = df_serving[profile_variables].copy()

In [None]:
# Calculate customer's tenure as 'Month' minus 'Month of Joining'.
# NOTE(review): assumes both columns hold comparable numeric month values — confirm.
# .assign() returns a new frame instead of writing a column into a sliced
# frame, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_training_feat = df_training_feat.assign(
    tenure=lambda frame: frame['Month'] - frame['Month of Joining']
)
df_serving_feat = df_serving_feat.assign(
    tenure=lambda frame: frame['Month'] - frame['Month of Joining']
)

# A notebook cell only renders its last expression, so put both summaries in
# one table rather than silently discarding the training one.
pd.DataFrame({
    'training': df_training_feat['tenure'].describe(),
    'serving': df_serving_feat['tenure'].describe(),
})



In [None]:
# List the object/category-typed columns of the training rows.
# NOTE(review): this inspects df_training (every column), not df_training_feat — confirm that is intended.
categorical_dtypes = ['object', 'category']
cat_feat = list(df_training.select_dtypes(include=categorical_dtypes).columns)
print(cat_feat)

In [None]:
# Now we need to transform the features of the feature store.
def encode_categorical_features(df_training, df_serving):
    """Label-encode the categorical columns of the training and serving frames.

    Every ``object``/``category`` column found in ``df_training`` is encoded
    with its own ``LabelEncoder``. The encoder is fitted on the training
    column only and then applied to the same column of ``df_serving``.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Frame used to fit the encoders.
    df_serving : pandas.DataFrame
        Frame encoded with the fitted encoders; it must contain every
        categorical column present in ``df_training``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Encoded copies ``(training, serving)``. The input frames are not
        modified.

    Raises
    ------
    ValueError
        If a serving column cannot be encoded with the encoder fitted on the
        training column (for example because it contains a value that never
        occurs in training). The message names the offending column.
    """
    # Work on copies: writing into the callers' frames mutates shared state
    # and triggers SettingWithCopyWarning when they are slices of another frame.
    encoded_training = df_training.copy()
    encoded_serving = df_serving.copy()

    # Get a list of all categorical columns
    cat_columns = encoded_training.select_dtypes(include=['object', 'category']).columns.tolist()

    # Encode each categorical column
    for col in cat_columns:
        le = LabelEncoder()
        encoded_training[col] = le.fit_transform(encoded_training[col])
        try:
            encoded_serving[col] = le.transform(encoded_serving[col])
        except ValueError as exc:
            # Same exception type as before, but say which column failed.
            raise ValueError(f"Could not encode serving column {col!r}: {exc}") from exc

    return encoded_training, encoded_serving

In [None]:
# Exclude the customer ID and the two month columns so they are not passed to the encoder
key_columns = ['Customer ID', 'Month', 'Month of Joining']

# Index.difference returns the remaining column names in sorted order.
# .copy() hands the encoder independent frames, so column writes never target
# a slice of the feature frames (no SettingWithCopyWarning).
training_labels = df_training_feat[df_training_feat.columns.difference(key_columns)].copy()
serving_labels = df_serving_feat[df_serving_feat.columns.difference(key_columns)].copy()

training_feat_encoded, serving_feat_encoded = encode_categorical_features(training_labels, serving_labels)

print(training_feat_encoded.columns)
print(serving_feat_encoded.columns)

In [None]:
# Key columns shared by the encoded feature frames and the id/outcome frames
merge_keys = ['Customer ID', 'Month', 'Month of Joining']
outcome_columns = merge_keys + ['Churn Value', 'offer']

# Re-attach the key columns to both encoded frames (values align on the row index)
for key in merge_keys:
    training_feat_encoded[key] = df_training_feat[key]
    serving_feat_encoded[key] = df_serving_feat[key]

# Inner-join the encoded features with the churn value and offer on the key columns
df_training_final = training_feat_encoded.merge(
    df_training_id[outcome_columns],
    how='inner',
    on=merge_keys,
)
df_serving_final = serving_feat_encoded.merge(
    df_serving_id[outcome_columns],
    how='inner',
    on=merge_keys,
)

df_training_final.head()