# Can We Predict Churn Rate?: The Lifeline of Startups and Their Customers

In [32]:
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import types
import random


In [33]:
# Step 1: Title and Intro
st.title('Can We Predict Churn Rate?: The Lifeline of Startups and Their Customers')
st.write("This app uses a logistics regression for KKBox's Churn Prediction Challenge.")

In [34]:
def my_hash_func(*args, **kwargs):
    return hash(args + tuple(kwargs.items()))

@st.cache(hash_funcs={types.FunctionType: my_hash_func})
def load_data():
    data = pd.read_csv('/Users/halmorishima/Github/CapstonePJT-KKBox_Churn_Prediction/05_Streamlit/data/train_cleaned_st_deploy.csv')
    return data

data = load_data()

2023-09-07 04:29:42.546 `st.cache` is deprecated. Please use one of Streamlit's new caching commands,
`st.cache_data` or `st.cache_resource`.

More information [in our docs](https://docs.streamlit.io/library/advanced-features/caching).


In [35]:
data.head()

Unnamed: 0,msno,is_churn,bd,registration_init_time,payment_plan_days,is_auto_renew,is_cancel,msno_count,city_agg_5.0,city_agg_13.0,...,payment_method_id_agg_38.0,payment_method_id_agg_39.0,payment_method_id_agg_40.0,payment_method_id_agg_41.0,payment_method_id_agg_Other,is_discount,discount_amount,avg_play_time,skip_tendency,membership_period
0,1,1,20,20131223.0,30,0,0,6.0,0,1,...,0,0,0,0,0,0,0.0,218.328406,0.0,17266
1,2,1,18,20131227.0,90,0,0,20.0,0,1,...,0,0,0,0,1,0,0.0,122.181022,0.430631,17331
2,3,1,35,20140125.0,30,1,0,15.0,0,1,...,0,0,0,1,0,0,0.0,243.288277,0.009128,17426
3,4,1,0,20140126.0,30,1,1,4.0,0,0,...,0,0,1,0,0,0,0.0,206.340102,0.122449,17238
4,5,1,28,20140202.0,410,0,0,31.0,0,0,...,1,0,0,0,0,0,0.0,249.213192,0.066838,17663


In [36]:
# Preparing the data
msno = data['msno']
X = data.drop(['msno', 'is_churn'], axis=1)
y = data['is_churn']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test, msno_train, msno_test = train_test_split(X, y, msno, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the model
model = LogisticRegression(penalty='l2', C=0.1, random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

# Creating a button to calculate probabilities
if st.button('Calculate Probabilities'):
    # Predicting probabilities on the test set
    y_proba = model.predict_proba(X_test_scaled)
    
    # Storing the results in a dataframe
    result_df = pd.DataFrame({'msno': msno_test, 'is_churn_proba': y_proba[:, 1]})