# Telco Churn Prediction

This notebook uses the Telco Customer Churn dataset from Kaggle. The goal of this notebook was to practice d3.js for data visualizations and to see if customer churn can be predicted. 

Code Referenced:
- https://medium.com/@stallonejacob/d3-in-juypter-notebook-685d6dca75c8
- Interactive Data Visualization for the Web, 2nd Edition
- https://www.d3-graph-gallery.com/graph/histogram_basic.html
- https://www.d3-graph-gallery.com/graph/histogram_tooltip.html

In [149]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from IPython.core.display import HTML, display
from string import Template

In [150]:
df = pd.read_csv('telco.csv')
data = df.to_dict(orient='records')

# Visualization

In [76]:
js_template = Template('''
var margin = {top: 10, right: 30, bottom: 30, left: 40},
        width = 460 - margin.left - margin.right,
        height = 400 - margin.top - margin.bottom;


    var svg = d3.select("#$graphdiv")
      .append("svg")
        .attr("width", width + margin.left + margin.right)
        .attr("height", height + margin.top + margin.bottom)
      .append("g")
        .attr("transform",
              "translate(" + margin.left + "," + margin.top + ")");

    d3.csv($telco)
      .then(function(data) {
        var x = d3.scaleLinear()
            .domain([0, d3.max(data, function(d) { return 2 })])
            .range([0, width]);
        svg.append("g")
            .attr("transform", "translate(0," + height + ")")
            .call(d3.axisBottom(x));

        // var y1 = d3.scaleLinear().range([height, 0]);

        var histogram = d3.histogram()
          .value(function(d) { return d.Churn; })   // I need to give the vector of value
          .domain(x.domain())  // then the domain of the graphic
          .thresholds(x.ticks(2))

        var bins = histogram(data);

        var y = d3.scaleLinear()
            .range([height, 0]);
            y.domain([0, d3.max(bins, function(d) { return d.length; })]);   // d3.hist has to be called before the Y axis obviously
        svg.append("g")
            .call(d3.axisLeft(y));

          // Add a tooltip div. Here I define the general feature of the tooltip: stuff that do not depend on the data point.
          // Its opacity is set to 0: we don't see it by default.
          var tooltip = d3.select("#graph-div")
            .append("div")
            .style("opacity", 0)
            .attr("class", "tooltip")
            .style("background-color", "white")
            .style("color", "#69b3a2")
            .style("border-radius", "5px")
            .style("padding", "10px")

          // A function that change this tooltip when the user hover a point.
          // Its opacity is set to 1: we can now see it. Plus it set the text and position of tooltip depending on the datapoint (d)
          var showTooltip = function(d) {
            tooltip
              .transition()
              .duration(100)
              .style("opacity", 1)
            tooltip
              .html("Churn: ".concat('', d.x0) )
              .style("left", (d3.mouse(this)[0]+40) + "px")
              .style("top", (d3.mouse(this)[1]) + "px")
          }
          var moveTooltip = function(d) {
            tooltip
            .style("left", (d3.mouse(this)[0]+20) + "px")
            .style("top", (d3.mouse(this)[1]) + "px")
          }
          // A function that change this tooltip when the leaves a point: just need to set opacity to 0 again
          var hideTooltip = function(d) {
            tooltip
              .transition()
              .duration(100)
              .style("opacity", 0)
          }

        svg.selectAll("rect")
          .data(bins)
          .enter()
          .append("rect")
            .attr("x", 1)
            .attr("transform", function(d) { return "translate(" + x(d.x0) + "," + y(d.length) + ")"; })
            .attr("width", function(d) { return x(d.x1) - x(d.x0) - 5; })
            .attr("height", function(d) { return height - y(d.length); })
            .style("fill", "#69b3a2")
            .on("mouseover", showTooltip )
            .on("mousemove", moveTooltip )
            .on("mouseleave", hideTooltip )

      })
      .catch(function(error){
         console.log(error)
      })
''')

In [77]:
js = js_template.substitute({'telco' : json.dumps(data), 'graphdiv' : 'graph-div'})

In [81]:
html_template = Template('''
<script src="https://d3js.org/d3.v4.min.js"></script>
<div id="graph-div"></div>
<script> $js </script>
''')

In [82]:
HTML(html_template.substitute({'js' : js}))

# Clean Data

In [151]:
def label_encode(df, col):
    col2idx = {k: i for i, k in enumerate(df[col].unique())}
    df[col] = df[col].map(col2idx)
    return col2idx

In [152]:
df = df.replace('Yes', 1)
df = df.replace('No', 0)

In [153]:
for col in df.columns:
    if 'No internet service' in df[col].unique():
        df[col] = df[col].replace('No internet service', 3)

  


In [154]:
gender2idx = label_encode(df, 'gender')
is2idx = label_encode(df, 'InternetService')
contract2idx = label_encode(df, 'Contract')
payment2idx = label_encode(df, 'PaymentMethod')
phoneservice2idx = label_encode(df, 'PhoneService')

df['OnlineSecurity'] = df['OnlineSecurity'].astype(float)
df['InternetService'] = df['InternetService'].astype(float)
df['OnlineBackup'] = df['OnlineBackup'].astype(float)
df['DeviceProtection'] = df['DeviceProtection'].astype(float)
df['TechSupport'] = df['TechSupport'].astype(float)
df['StreamingTV'] = df['StreamingTV'].astype(float)
df['StreamingMovies'] = df['StreamingMovies'].astype(float)

In [155]:
df.describe()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.504756,0.162147,0.483033,0.299588,32.371149,0.903166,0.872923,0.936675,0.994889,0.993895,0.940224,1.03436,1.03791,0.690473,0.592219,1.315633,64.761692,0.26537
std,0.500013,0.368612,0.499748,0.45811,24.559481,0.295752,0.737796,1.165986,1.142491,1.142921,1.16465,1.124568,1.122874,0.833755,0.491457,1.148907,30.090047,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,0.0
25%,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.5,0.0
50%,1.0,0.0,0.0,0.0,29.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,70.35,0.0
75%,1.0,0.0,1.0,1.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,89.85,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,118.75,1.0


In [156]:
df['TotalCharges'] = df['TotalCharges'].astype(float, errors='ignore')

In [157]:
charges = []
for c in df['TotalCharges'].values:
    try:
        charges.append(float(c))
    except:
        charges.append(0)

In [158]:
df['TotalCharges'] = pd.Series(charges)

In [159]:
df['TotalCharges'] = df['TotalCharges'].replace(0, df['TotalCharges'].median())

In [160]:
df.to_csv('telco_cleaned.csv', index=False)

# EDA

In [161]:
df.describe()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.504756,0.162147,0.483033,0.299588,32.371149,0.903166,0.872923,0.936675,0.994889,0.993895,0.940224,1.03436,1.03791,0.690473,0.592219,1.315633,64.761692,2281.91236,0.26537
std,0.500013,0.368612,0.499748,0.45811,24.559481,0.295752,0.737796,1.165986,1.142491,1.142921,1.16465,1.124568,1.122874,0.833755,0.491457,1.148907,30.090047,2265.272185,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,18.8,0.0
25%,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.5,402.225,0.0
50%,1.0,0.0,0.0,0.0,29.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,70.35,1394.55,0.0
75%,1.0,0.0,1.0,1.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,89.85,3786.6,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,118.75,8684.8,1.0


In [162]:
df['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64