In [None]:
# import libraries
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import warnings

# Load the processed telecom offer dataset from Cloud Storage.
df = pd.read_csv(
    filepath_or_buffer='gs://divg-groovyhoon-pr-d2eab4-default/projectpro/processed_telecom_offer_data.csv'
)

# Preview the first five rows.
df.head(n=5)

In [None]:
# Split dataframe to a training and serving dataset
def training_serving_split(df): 
    """Partition ``df`` on its 'offer' column.

    Returns a ``(training, serving)`` tuple: ``training`` holds the rows whose
    'offer' differs from 'No Offer', ``serving`` holds the rows whose 'offer'
    equals 'No Offer'. Row order and index labels are kept.
    """
    offer_column = df['offer']
    with_offer = df.loc[offer_column.ne('No Offer')]
    without_offer = df.loc[offer_column.eq('No Offer')]
    return with_offer, without_offer

# Partition the loaded frame, then report (rows, columns) for each part.
df_training, df_serving = training_serving_split(df)

print(*(part.shape for part in (df_training, df_serving)))

In [None]:

# Row keys plus the business-outcome columns (offer and churn information).
id_variables = [
    'Customer ID', 'Month', 'Month of Joining',
    'offer',
    'Churn Category', 'Churn Reason', 'Customer Status', 'Churn Value',
]

# Row keys plus the columns that describe a customer's profile and usage.
profile_variables = [
    # keys
    'Customer ID', 'Month', 'Month of Joining',
    # demographics
    'Gender', 'Age', 'Married', 'Number of Dependents', 'area_codes',
    # roaming and outgoing usage
    'roam_ic', 'roam_og',
    'loc_og_t2t', 'loc_og_t2m', 'loc_og_t2f', 'loc_og_t2c',
    'std_og_t2t', 'std_og_t2m', 'std_og_t2f', 'std_og_t2c',
    'isd_og', 'spl_og', 'og_others',
    # incoming usage
    'loc_ic_t2t', 'loc_ic_t2m', 'loc_ic_t2f',
    'std_ic_t2t', 'std_ic_t2m', 'std_ic_t2f', 'std_ic_t2o',
    'spl_ic', 'isd_ic', 'ic_others',
    # recharge, data volume and revenue
    'total_rech_amt', 'total_rech_data', 'vol_4g', 'vol_5g',
    'arpu_5g', 'arpu_4g', 'arpu', 'aug_vbc_5g',
    # services and billing
    'Number of Referrals', 'Phone Service', 'Multiple Lines',
    'Internet Service', 'Internet Type', 'Streaming Data Consumption',
    'Online Security', 'Online Backup', 'Device Protection Plan',
    'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
    'Streaming Music', 'Unlimited Data', 'Payment Method',
]

# Split each dataset into an id/outcome frame and a profile-feature frame.
df_training_id, df_training_feat = df_training[id_variables], df_training[profile_variables]
df_serving_id, df_serving_feat = df_serving[id_variables], df_serving[profile_variables]

In [None]:
# Customer tenure = 'Month' minus 'Month of Joining', for both feature frames.
# `assign` returns a new frame instead of writing a column into an object that
# was sliced out of df_training / df_serving, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
df_training_feat = df_training_feat.assign(
    tenure=df_training_feat['Month'] - df_training_feat['Month of Joining']
)
df_serving_feat = df_serving_feat.assign(
    tenure=df_serving_feat['Month'] - df_serving_feat['Month of Joining']
)

# Only the last expression of a cell is rendered, so show both summaries
# side by side rather than silently discarding the training one.
pd.concat(
    [
        df_training_feat['tenure'].describe().rename('training'),
        df_serving_feat['tenure'].describe().rename('serving'),
    ],
    axis=1,
)



In [None]:
# Names of the object/category-typed columns in the training data.
cat_feat = list(
    df_training.select_dtypes(include=['object', 'category']).columns
)
print(cat_feat)

In [None]:
# Now we need to transform the features of the feature store.
def encode_categorical_features(df_training, df_serving):
    """Label-encode the categorical columns of a training/serving frame pair.

    For every object- or category-typed column of ``df_training`` a fresh
    ``LabelEncoder`` is fitted on the training values and then applied to the
    same column of ``df_serving``, so both frames share one integer coding.

    The inputs are not modified: encoding happens on copies. This keeps the
    caller's frames reusable and avoids writing into slices of another frame.

    Parameters:
        df_training: DataFrame used to fit the encoders.
        df_serving: DataFrame encoded with the encoders fitted on training.

    Returns:
        (encoded_training, encoded_serving): new DataFrames with the
        categorical columns replaced by integer codes.

    Raises:
        ValueError: if a serving column cannot be encoded with the encoder
            fitted on the training column (for example because it holds a
            value that never occurs in training). The message names the column.
    """
    encoded_training = df_training.copy()
    encoded_serving = df_serving.copy()

    # Get a list of all categorical columns
    cat_columns = encoded_training.select_dtypes(include=['object', 'category']).columns.tolist()

    # Encode each categorical column
    for col in cat_columns:
        le = LabelEncoder()
        encoded_training[col] = le.fit_transform(encoded_training[col])
        try:
            encoded_serving[col] = le.transform(encoded_serving[col])
        except ValueError as exc:
            # Same exception type as before, but say which column failed.
            raise ValueError(
                f"Could not encode serving column {col!r} with the encoder "
                f"fitted on the training data: {exc}"
            ) from exc

    return encoded_training, encoded_serving

In [None]:
# Keep the key columns ('Customer ID', 'Month', 'Month of Joining') out of the
# frames handed to the encoder so they are not label-encoded.
# Note: Index.difference returns the remaining column names in sorted order.
training_labels = df_training_feat[
    df_training_feat.columns.difference(['Customer ID','Month','Month of Joining'])
]
serving_labels = df_serving_feat[
    df_serving_feat.columns.difference(['Customer ID','Month','Month of Joining'])
]

training_feat_encoded, serving_feat_encoded = encode_categorical_features(
    training_labels, serving_labels
)

print(training_feat_encoded.columns, serving_feat_encoded.columns, sep='\n')

In [None]:
# Re-attach the key columns to the encoded training features.
# `assign` builds a new frame (values are aligned on the row index, exactly as
# a column assignment would be) instead of writing into frames that originate
# from slices, which avoids SettingWithCopyWarning and makes the cell re-runnable.
training_feat_encoded = training_feat_encoded.assign(**{
    'Customer ID': df_training_feat['Customer ID'],
    'Month': df_training_feat['Month'],
    'Month of Joining': df_training_feat['Month of Joining'],
})

# Same for the encoded serving features.
serving_feat_encoded = serving_feat_encoded.assign(**{
    'Customer ID': df_serving_feat['Customer ID'],
    'Month': df_serving_feat['Month'],
    'Month of Joining': df_serving_feat['Month of Joining'],
})

# Bring back the outcome columns ('Churn Value', 'offer') by joining on the keys.
# The merge also resets the row index to 0..N-1.
# NOTE(review): this assumes ('Customer ID', 'Month', 'Month of Joining') is
# unique per row; repeated keys would multiply rows in an inner merge - confirm.
df_training_final = pd.merge(
    training_feat_encoded,
    df_training_id[['Customer ID','Month','Month of Joining','Churn Value','offer']],
    how='inner',
    on=['Customer ID','Month','Month of Joining'],
)
df_serving_final = pd.merge(
    serving_feat_encoded,
    df_serving_id[['Customer ID','Month','Month of Joining','Churn Value','offer']],
    how='inner',
    on=['Customer ID','Month','Month of Joining'],
)

df_training_final.head()

In [None]:
# Persist both final datasets as CSV (row index omitted).
df_training_final.to_csv(
    path_or_buf='gs://divg-groovyhoon-pr-d2eab4-default/projectpro/df_training_final.csv',
    index=False,
)
df_serving_final.to_csv(
    path_or_buf='gs://divg-groovyhoon-pr-d2eab4-default/projectpro/df_serving_final.csv',
    index=False,
)

In [None]:
# Preview the first five rows of the final training frame.
df_training_final.head(n=5)

In [None]:
# Inputs for the step-by-step walkthrough of the recommendation logic below.
# NOTE: `df` is rebound here and no longer refers to the raw frame loaded at the top.
id_cols = ['Customer ID', 'Month', 'Month of Joining', 'Churn Value', 'offer']
df = df_training_final
df_id = df_training_final[id_cols]

# Customer / month to look up.
customer_id, month = 'pnmkvvmrglxpm41', 13

# Similarity settings: metric name and number of neighbours to keep.
distance_func, n = 'cosine', 2000

# Offer selection: minimal share of neighbours and maximum number of offers.
minimal_threshold, max_offers_to_return = 0.10, 3

# Equivalent single call (left disabled; `train` is not defined in the cells shown here):
# offers = get_recommended_offers (train, train[id_cols], customer_id, month,distance_func,n,minimal_threshold,max_offers_to_return)

In [None]:
# Feature matrix: every column except the row keys and the offer label.
# ('Churn Value' is not excluded, so it is part of each feature vector.)
features = df_training_final.columns.difference(
    ['Customer ID', 'Month', 'Month of Joining', 'offer']
).tolist()
X = df_training_final.loc[:, features].values

print(X)

In [None]:
# Extract the feature vector of the chosen customer/month.
# X is a plain NumPy array, so it must be indexed by row *position*; the
# DataFrame's index label only coincides with the position while the frame has
# a default 0..N-1 index. Resolve the position explicitly instead.
target_mask = (df_training_final['Customer ID'] == customer_id) & (df_training_final['Month'] == month)
target_positions = np.flatnonzero(target_mask.fillna(False).to_numpy(dtype=bool))
if target_positions.size == 0:
    raise IndexError(f'No row found for customer {customer_id!r} in month {month!r}.')

index = int(target_positions[0])  # first match, as before
x = X[index]

print(x)

In [None]:
# Cosine *distance* = 1 - cosine similarity. The following cell keeps the rows
# with the smallest values, so this must be a distance (small = similar); using
# the raw similarity would select the least similar customers. This matches the
# 'cosine' branch of get_recommended_offers.
distances = 1 - cosine_similarity(X, x.reshape(1, -1)).flatten()

print(np.min(distances), np.max(distances))
print(distances)

In [None]:

# find the indices of the n customers with lowest distance
most_similar_indices = distances.argsort()[:n]

# Spot-check the distances of the five nearest rows. These are taken from the
# current result rather than from hard-coded row numbers, which only made sense
# for one particular query and fail on a smaller dataset.
print(*distances[most_similar_indices[:5]])

print(most_similar_indices)

In [None]:

# Pull the rows of the nearest customers (positional lookup, nearest first).
similar_customers = df_training_final.iloc[most_similar_indices, :]

similar_customers.head(n=5)

# Optional export, left disabled:
# similar_customers.to_csv('gs://divg-groovyhoon-pr-d2eab4-default/projectpro/similar_customers.csv', index=False)


In [None]:
# Keep only the similar customers that are present in the id dataframe.
# The key rows of df_id are de-duplicated first: an inner merge against
# repeated keys would multiply the matching rows and inflate the offer counts.
similar_customers = pd.merge(
    similar_customers,
    df_id[['Customer ID','Month of Joining','Month','Churn Value']].drop_duplicates(),
    on=['Customer ID','Month of Joining','Month','Churn Value'],
)

# select the customers that did not churn
similar_customers = similar_customers[similar_customers['Churn Value']==0]

similar_customers.head()

In [None]:
# Count non-churned similar customers per offer, most frequent offer first.
top_offers = (
    similar_customers[['Customer ID', 'offer']]
    .groupby(['offer'])
    .agg({'Customer ID': 'count'})
    .reset_index()
    .sort_values(by='Customer ID', ascending=False)
    .rename(columns={'Customer ID': 'customer_count'})
)

# Share of each offer among all counted customers.
top_offers['perc_total'] = top_offers['customer_count'] / top_offers['customer_count'].sum()

print(top_offers)



In [None]:
# Keep offers whose share is strictly above the threshold, capped at the
# requested number of offers.
top_offers_min = (
    top_offers
    .loc[top_offers['perc_total'].gt(minimal_threshold)]
    .head(max_offers_to_return)
)

top_offers_min['offer'].unique()


In [None]:
def get_recommended_offers (df:pd.DataFrame, 
                            df_id:pd.DataFrame,
                            customer_id:str,
                            month:int,
                            distance_func:str,
                            n,
                            minimal_threshold:float,
                            max_offers_to_return:int
                            ):
    """
    Recommend offers for one customer, based on the offers held by the most
    similar customers who did not churn.

    Input parameters: 
    1. df: DataFrame with one row per customer and month. It must contain the columns
       'Customer ID', 'Month', 'Month of Joining', 'offer' and 'Churn Value'. Every column
       except 'Customer ID', 'Month', 'Month of Joining' and 'offer' is used as a feature
       (this includes 'Churn Value'), so those columns must be numeric.
    2. df_id: DataFrame with the columns 'Customer ID', 'Month of Joining', 'Month' and
       'Churn Value'. Only similar customers whose combination of these four values occurs
       in df_id are counted; repeated rows in df_id count once.
    3. customer_id: 'Customer ID' of the customer to make recommendations for
    4. month: 'Month' of the row of that customer to use as the reference
    5. distance_func: The distance function to calculate similarity score (choose from 'euclidean', 'manhattan', or 'cosine')
    6. n: The number of similar customers we want to base our recommendations on (e.g. 100 or 1000)
    7. minimal_threshold: An offer is kept only if its share among the counted customers
       is strictly greater than this value
    8. max_offers_to_return: The maximum number of top offers to recommend to the customer
    
    Output parameters: 
    An array of offers to recommend to the specified customer, most frequent first.
    It can hold fewer than max_offers_to_return entries, or be empty.

    Raises:
    IndexError: df has no row for the given customer_id and month
    ValueError: distance_func is not one of the supported names
    """
    key_cols = ['Customer ID', 'Month of Joining', 'Month', 'Churn Value']

    # extract the feature vectors of all customers
    features = list(df.columns.difference(['Customer ID','Month','Month of Joining','offer']))
    X = df[features].values

    # Locate the reference row by *position*: X is a NumPy array, and df's index
    # labels only equal row positions when df has a default 0..N-1 index.
    is_target = (df['Customer ID'] == customer_id) & (df['Month'] == month)
    target_positions = np.flatnonzero(is_target.fillna(False).to_numpy(dtype=bool))
    if target_positions.size == 0:
        raise IndexError(f'No row found for customer {customer_id!r} in month {month!r}.')
    x = X[target_positions[0]].reshape(1, -1)

    # compute the distances between the feature vectors
    if distance_func == 'euclidean':
        distances = euclidean_distances(X, x).flatten()
    elif distance_func == 'manhattan':
        distances = manhattan_distances(X, x).flatten()
    elif distance_func == 'cosine':
        # turn the similarity into a distance so that small means similar
        distances = 1 - cosine_similarity(X, x).flatten()
    else:
        raise ValueError('Invalid distance function specified.')

    # find the positions of the n customers with lowest distance
    most_similar_positions = distances.argsort()[:n]

    # extract the customer data for the most similar customers
    similar_customers = df.iloc[most_similar_positions]

    # Restrict to customers listed in df_id. The keys are de-duplicated first,
    # because repeated key rows would multiply matches in the inner merge and
    # inflate the offer counts.
    allowed_keys = df_id[key_cols].drop_duplicates()
    similar_customers = pd.merge(similar_customers, allowed_keys, on=key_cols)

    # select the customers that did not churn
    similar_customers = similar_customers[similar_customers['Churn Value']==0]

    # count the top offers of the non-churned customers; a stable sort keeps
    # tied offers in a deterministic order
    top_offers = (
        similar_customers[['Customer ID','offer']]
        .groupby(['offer'])
        .agg({'Customer ID':'count'})
        .reset_index()
        .sort_values(by='Customer ID', ascending=False, kind='mergesort')
    )
    top_offers['perc_total'] = top_offers['Customer ID']/top_offers['Customer ID'].sum()
    top_offers_min = top_offers[top_offers['perc_total']>minimal_threshold].head(max_offers_to_return)

    return top_offers_min['offer'].unique()

In [None]:
# All rows of the final training frame for this customer.
df_training_final.loc[df_training_final['Customer ID'].eq('nfhyvxuubmklt23')]

In [None]:
# Show the column index of the final training frame.
df_training_final.columns

In [None]:
# Example: cosine distance, 1000 neighbours, up to 5 offers.
customer_id = 'nfhyvxuubmklt23' 
month = 6
distance_func = 'cosine'
n = 1000
minimal_threshold= 0.10
max_offers_to_return = 5
id_cols=['Customer ID','Month','Month of Joining','Churn Value','offer']

offers = get_recommended_offers(df_training_final, df_training_final[id_cols], customer_id, month, distance_func, n, minimal_threshold, max_offers_to_return)

# Fewer than three offers may pass the threshold, so iterate over what was
# actually returned instead of indexing offers[0..2] (which raises IndexError).
if len(offers) == 0:
    print('No offer passed the minimal threshold for this customer.')
for ordinal, offer in zip(('first', 'second', 'third'), offers):
    print('The ' + ordinal + ' offer to recommend is ' + str(offer))


In [None]:
# All rows of the final training frame for this customer.
df_training_final.loc[df_training_final['Customer ID'].eq('hlknupiesduin44')]

In [None]:
# Example: euclidean distance, 1000 neighbours, up to 3 offers.
customer_id = 'hlknupiesduin44' 
month = 5
distance_func = 'euclidean'
n = 1000
minimal_threshold= 0.10
max_offers_to_return = 3
id_cols=['Customer ID','Month','Month of Joining','Churn Value','offer']

offers = get_recommended_offers(df_training_final, df_training_final[id_cols], customer_id, month, distance_func, n, minimal_threshold, max_offers_to_return)

# Fewer than three offers may pass the threshold, so iterate over what was
# actually returned instead of indexing offers[0..2] (which raises IndexError).
if len(offers) == 0:
    print('No offer passed the minimal threshold for this customer.')
for ordinal, offer in zip(('first', 'second', 'third'), offers):
    print('The ' + ordinal + ' offer to recommend is ' + str(offer))

In [None]:
# All rows of the final training frame for this customer.
df_training_final.loc[df_training_final['Customer ID'].eq('hvazrzbwvnhru233')]

In [None]:
# Example: manhattan distance, 1000 neighbours, up to 3 offers.
customer_id = 'hvazrzbwvnhru233' 
month = 7
distance_func = 'manhattan'
n = 1000
minimal_threshold= 0.10
max_offers_to_return = 3
id_cols=['Customer ID','Month','Month of Joining','Churn Value','offer']

offers = get_recommended_offers(df_training_final, df_training_final[id_cols], customer_id, month, distance_func, n, minimal_threshold, max_offers_to_return)

# Fewer than three offers may pass the threshold, so iterate over what was
# actually returned instead of indexing offers[0..2] (which raises IndexError).
if len(offers) == 0:
    print('No offer passed the minimal threshold for this customer.')
for ordinal, offer in zip(('first', 'second', 'third'), offers):
    print('The ' + ordinal + ' offer to recommend is ' + str(offer))

In [None]:
# Customer IDs at row positions 200 through 249.
df_training_final['Customer ID'].iloc[200:250]