## Importing Packages

In [1]:
#import the required libraries
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.ticker as mtick  
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import scipy.stats as stats
from scipy.stats import chi2_contingency

# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, fbeta_score
from sklearn.metrics import confusion_matrix

# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Other packages
from tabulate import tabulate
import os, pickle

import warnings
warnings.filterwarnings('ignore')

import pickle

ModuleNotFoundError: No module named 'tabulate'

##  Load Data

In [None]:
# Read the raw churn export; the path is resolved relative to the notebook's working directory.
voda = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')

# Preview 50 randomly drawn rows (unseeded, so the preview differs between runs).
voda.sample(n=50)

### Data Understanding
This is a telecommunications company's customer dataset, containing various demographic and usage information for each customer, as well as whether or not they have churned (i.e. cancelled their service). Here are the meanings of the columns:

- customerID: unique identifier for each customer
- gender: customer gender (Male/Female)
- SeniorCitizen: indicates if the customer is a senior citizen or not (1, 0)
- Partner: indicates if the customer has a partner or not (Yes, No)
- Dependents: indicates if the customer has dependents or not (Yes, No)
- tenure: number of months the customer has stayed with the company
- PhoneService: indicates if the customer has a phone service or not (Yes, No)
- MultipleLines: indicates if the customer has multiple lines or not (Yes, No, No phone service)
- InternetService: type of internet service the customer has (DSL, Fiber optic, No)
- OnlineSecurity: indicates if the customer has online security or not (Yes, No, No internet service)
- DeviceProtection: indicates if the customer has device protection or not (Yes, No, No internet service)
- TechSupport: indicates if the customer has tech support or not (Yes, No, No internet service)
- StreamingTV: indicates if the customer has streaming TV or not (Yes, No, No internet service)
- StreamingMovies: indicates if the customer has streaming movies or not (Yes, No, No internet service)
- Contract: type of contract the customer has (Month-to-month, One year, Two year)
- PaperlessBilling: indicates if the customer has paperless billing or not (Yes, No)
- PaymentMethod: payment method used by the customer (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
- MonthlyCharges: amount charged to the customer monthly
- TotalCharges: total amount charged to the customer
- Churn: indicates if the customer has churned or not (Yes, No)

## Exploratory Data Analysis

#### Data Overview

In [None]:
# Checking the data types of all the columns
voda.dtypes

In [None]:
# Check the descriptive statistics of numeric variables
voda.describe().T

##### Comment

SeniorCitizen is actually a categorical variable, hence its 25%-50%-75% distribution is not meaningful

75% customers have tenure less than 55 months

Average Monthly charges are USD 64.76 whereas 25% customers pay more than USD 89.85 per month

In [None]:
voda.info()

#### Issues with Dataset:
* Datatype issues : Total charges should be in float dtype
* No missing data
* No duplicates
* The tenure columns needs to be categorized
* There are eleven blank values in the total charges column <br>

#### Approach:
* Change the datatype of the TotalCharges column to numeric.
* Fill the null values with 0, since those rows correspond to customers that have not yet begun their subscription with the telco company.
* Binning the tenure columns
* Encoding the Churn column into 1's and 0's

#### Data Cleaning

In [None]:
#Create a copy of the original data
data = voda.copy() 

##### Cleaning TotalCharges Column

In [None]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Per-column count of missing values after the conversion.
data.isna().sum()

In [None]:
# Replace the missing TotalCharges entries with 0; no other column is filled.
data = data.fillna({'TotalCharges': 0})

##### Creating bins for tenure column

In [None]:
# Left-closed 12-month bin edges: [1, 13), [13, 25), ..., [61, 73).
tenure_bin_edges = list(range(1, 74, 12))

# One label per bin, named after the inclusive month range it covers ("1-12", ..., "61-72").
labels = [f"{start}-{start + 11}" for start in tenure_bin_edges[:-1]]

# Assign every customer to a tenure bin.
# NOTE(review): tenure values below 1 fall outside the first bin and become NaN —
# confirm whether zero-tenure customers should get their own bin.
data['tenure_group'] = pd.cut(data['tenure'], bins=tenure_bin_edges, right=False, labels=labels)

# Number of customers per bin, in bin order.
tenure_counts = data['tenure_group'].value_counts().sort_index()

##### Dropping Columns

In [None]:
# Remove the identifier column and the raw tenure column (tenure_group replaces it).
data = data.drop(columns=['customerID', 'tenure'])

#### **Questions:**
#### 1. What is the most preferred internet service of customers?
#### 2. Understand customer demographics with respect to gender?
#### 3. Which service is most popular with customers?
#### 4. Are customers without dependents likely to have higher charges than those with dependent?

#### **Hypothesis**

##### **Null Hypothesis** : Senior citizen does not correlate with the tendency of customer churn
##### **Alternate Hypothesis** : Senior citizen status correlates with the tendency of customer churn

### Analysis

In [None]:
data.head()

###### Univariate Analysis

##### Univariate Analysis

In [None]:

# Every column except the target and the two continuous charge columns gets a grouped count plot.
excluded_columns = {'Churn', 'TotalCharges', 'MonthlyCharges'}
categorical_predictors = [column for column in data.columns if column not in excluded_columns]

for predictor in categorical_predictors:
    fig = px.histogram(
        data,
        x=predictor,
        color='Churn',
        barmode='group',
        color_discrete_sequence=['#1f77b4', '#aec7e8'],
        title=f"Countplot of {predictor} by Churn",
    )
    fig.update_layout(
        xaxis_title=predictor,
        yaxis_title="Count",
        legend_title="Churn",
        height=400,
    )
    fig.show()


In [None]:
# Monthly-charge density for churned customers, drawn first (blue).
churned_monthly = data.loc[data['Churn'] == 'Yes', 'MonthlyCharges']
Mth_Chgs = sns.kdeplot(churned_monthly, color="Blue", fill=True, alpha=.5)

# Overlay the retained customers on the same axes (green).
retained_monthly = data.loc[data['Churn'] == 'No', 'MonthlyCharges']
Mth_Chgs = sns.kdeplot(retained_monthly, ax=Mth_Chgs, color="Green", fill=True, alpha=.5)

# Legend entries follow the drawing order above.
Mth_Chgs.legend(["Churn", "No Churn"], loc='upper right')
Mth_Chgs.set_xlabel('Monthly Charges')
Mth_Chgs.set_ylabel('Density')
Mth_Chgs.set_title('Monthly charges by churn')

plt.show()

In [None]:
# Total-charge density for retained customers, drawn first (green).
retained_total = data.loc[data['Churn'] == 'No', 'TotalCharges']
tot_Chgs = sns.kdeplot(retained_total, color="Green", fill=True, alpha=.5)

# Overlay the churned customers on the same axes (blue, more transparent).
churned_total = data.loc[data['Churn'] == 'Yes', 'TotalCharges']
tot_Chgs = sns.kdeplot(churned_total, ax=tot_Chgs, color="Blue", fill=True, alpha=.3)

# Legend entries follow the drawing order above.
tot_Chgs.legend(["No Churn", "Churn"], loc='upper right')
tot_Chgs.set_xlabel('Total Charges')
tot_Chgs.set_ylabel('Density')
tot_Chgs.set_title('Total charges by churn');


In [None]:
# Monthly vs. total charges, one OLS trendline per churn class.
fig = px.scatter(
    data_frame=data,
    x='MonthlyCharges',
    y='TotalCharges',
    color='Churn',
    trendline='ols',
    title='Monthly Charges vs Total Charges',
)

fig.update_layout(
    xaxis_title='Monthly Charges',
    yaxis_title='Total Charges',
    height=400,
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
)

fig.show()

Derived Insight:

**HIGH** Churn seen in case of **Month to month contracts**, **No online security, No Tech support, First year of subscription** and **Fibre Optics Internet**

**LOW** Churn is seen in case of **Long term contracts, Subscriptions without internet service** and **The customers engaged for 5+ years**


##### Hypothesis Testing

##### Senior citizens and the tendency to customer churn

Null Hypothesis : Senior citizen does not correlate with the tendency of customer churn.<br>
Alternate Hypothesis : Senior citizen status correlates with the tendency of customer churn.

In [None]:
# Encode the target as 1 (churned) / 0 (retained).
# The labels are read from the untouched raw frame `voda` rather than from `data`,
# so re-running this cell gives the same result; encoding `data.Churn` in place
# would turn every label into 0 on a second run. `data` keeps the row order of `voda`.
data['Churn'] = np.where(voda['Churn'] == 'Yes', 1, 0)

In [None]:
# Chi-square test of independence between SeniorCitizen and Churn on the raw data.
# chi2_contingency is already imported in the notebook's import cell.
Crosstabresults = pd.crosstab(index=voda['SeniorCitizen'], columns=voda['Churn'])
chisqresult = chi2_contingency(observed=Crosstabresults)

In [None]:
print('p-value: ', chisqresult[1])

Insights:<br>
P-value is less than 0.05, which implies that we reject our null hypothesis. Senior citizens are more likely to churn. 

##### Gender and the tendency to customer churn 

Null Hypothesis :Gender does not correlate with the tendency of customer churn.<br>
Alternate Hypothesis : Gender correlates with the tendency of customer churn.

3. Convert all the categorical variables into dummy variables

In [None]:
# Chi-square test of independence between gender and Churn on the raw data.
Crosstabresults1 = pd.crosstab(index=voda['gender'], columns=voda['Churn'])
chisqresult1 = chi2_contingency(observed=Crosstabresults1)

In [None]:
chisqresult1[1]

9. Relationship between Monthly Charges and Total Charges

Insights:<br>
P-value is greater than 0.05, which implies that we fail to reject our null hypothesis. A customer's tendency to churn does not depend on their gender.


### 1. What is the most preferred internet service of customers?


In [None]:
# Number of customers per internet service type, as a two-column frame
# (InternetService, Number).
inter_serv = (
    voda.groupby('InternetService')['customerID']
    .count()
    .reset_index(name='Number')
)
inter_serv

In [None]:
# Slice sizes and slice names for the pie chart.
keys = inter_serv['InternetService']
data_inter_serv = inter_serv['Number']

fig = px.pie(
    names=keys,
    values=data_inter_serv,
    title='Customer Internet Preference',
    labels={'value': 'Number'},
    color_discrete_sequence=px.colors.sequential.Blues,
)

# Pull the first two slices slightly out of the pie; label each slice inside with name and share.
fig.update_traces(pull=[0.1, 0.1, 0], textinfo='percent+label', textposition='inside')

fig.show()

### 2. Understand customer demographics with respect to gender

In [None]:
# Demographic slice of the raw data (customerID through Dependents); customerID is
# renamed to 'Number' because it is only used for counting further down.
cus_demo = (
    voda.loc[:, 'customerID':'Dependents']
    .rename(columns={'customerID': 'Number'})
)
cus_demo

In [None]:
def _count_by_gender(column):
    """Return the number of customers per (column, gender) pair as a flat frame."""
    return cus_demo.groupby([column, 'gender'])['Number'].count().reset_index()


# One aggregate per demographic attribute: Senior Citizen, Partner and Dependents.
cus_demo1 = _count_by_gender('SeniorCitizen')
cus_demo2 = _count_by_gender('Partner')
cus_demo3 = _count_by_gender('Dependents')

In [None]:
# Settings shared by the three grouped bar charts (bars split by gender).
bar_style = dict(
    y='Number',
    color='gender',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)

fig1 = px.bar(cus_demo1, x='SeniorCitizen',
              category_orders={'SeniorCitizen': [0, 1]}, **bar_style)
fig2 = px.bar(cus_demo2, x='Partner',
              category_orders={'Partner': ['Yes', 'No']}, **bar_style)
fig3 = px.bar(cus_demo3, x='Dependents',
              category_orders={'Dependents': ['Yes', 'No']}, **bar_style)

# Only the first figure carries the overall title.
fig1.update_layout(title='Customer Demographics', xaxis_title='Senior Citizen', yaxis_title='Number')
fig2.update_layout(xaxis_title='Partner', yaxis_title='Number')
fig3.update_layout(xaxis_title='Dependents', yaxis_title='Number')

for figure in (fig1, fig2, fig3):
    figure.show()

Insights:
* Majority of  customers are not senior citizens. That is , the client base is largely young people.
* Majority of the customers are not married. 
* Majority of the customers do not have dependents.

### 3. Which service is most popular with customers?

In [None]:
# Service columns of the cleaned data (PhoneService through StreamingMovies).
cus_serv = data.loc[:, 'PhoneService':'StreamingMovies']

# Yes/No service columns; InternetService is left out because its values are service types.
yes_no_services = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Keep only the 'Yes' cells (everything else becomes NaN) and count what remains per column.
subscribed = cus_serv[yes_no_services] == 'Yes'
serv_tally = cus_serv.where(subscribed).count()

In [None]:
# Internet subscribers are all customers whose InternetService is not 'No'.
# Selecting by label, rather than by the first two row positions of `inter_serv`,
# keeps the count correct even if the row order of the aggregate changes.
has_internet = inter_serv['InternetService'] != 'No'
serv_tally['InternetService'] = inter_serv.loc[has_internet, 'Number'].sum()

In [None]:
# Distribution of monthly charges, split by whether the customer has dependents.
fig = px.box(
    data_frame=data,
    x='Dependents',
    y='MonthlyCharges',
    color='Dependents',
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
)
fig.update_layout(
    title='Monthly Charges by Dependents',
    xaxis_title='Dependents',
    yaxis_title='Monthly Charges',
)
fig.show()

In [None]:
# Order services from least to most subscribed.
serv_tally = serv_tally.sort_values(ascending=True)

# Horizontal bars: one bar per service, bar length = number of subscribers.
service_bars = go.Bar(
    x=serv_tally.values,
    y=serv_tally.index,
    orientation='h',
    marker_color='blue',
)
fig = go.Figure(service_bars)

fig.update_layout(title='Most Popular Service', xaxis_title='Number of Customers')
fig.show()

### 4. Are customers without dependents likely to have higher charges than those with dependent?

In [None]:
# Distribution of total charges, split by dependents status ('No' shown first).
fig = px.box(
    data_frame=data,
    x='Dependents',
    y='TotalCharges',
    color='Dependents',
    color_discrete_sequence=['#1f77b4', '#aec7e8'],
    category_orders={'Dependents': ['No', 'Yes']},
)
fig.update_layout(
    title='Distribution of Total Charges by Dependents Status',
    xaxis_title='Dependents',
    yaxis_title='Total Charges',
)
fig.show()

### Feature Processing - ML

In [None]:
data.head()

In [None]:
# Separate the feature matrix from the churn target.
X = data.drop(columns='Churn')
y = data['Churn']

#### Feature Scaling

In [None]:
# Numeric feature columns are rescaled to the [0, 1] range; other columns are untouched.
num_cols = X.select_dtypes(include='number').columns

# NOTE(review): the scaler is fitted on every row of X before the train/eval split,
# so evaluation rows influence the fitted min/max — consider fitting on the training
# split only.
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])

#### Feature Encoding

In [None]:
# One-hot encode every non-numeric feature column.
X = pd.get_dummies(data=X)

# Encode the target labels as consecutive integers (the result is a NumPy array).
target_encoder = LabelEncoder()
y = target_encoder.fit(y).transform(y)

### Data Splitting

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state makes the split (and every score reported below) reproducible across
# kernel restarts; stratify keeps the churn / no-churn ratio the same in both splits,
# which matters because the target is imbalanced.
x_train, x_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Balancing Dataset

Approach:

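* In order to balance the dataset, we can either oversample (increase the minority class) or undersample (decrease the majority class).
* With regards to our dataset, it would be best to oversample, since undersampling has a tendency to remove our Not Churn data from the set. The cell below uses SMOTEENN, which combines SMOTE oversampling with Edited Nearest Neighbours cleaning, so some samples are still removed after oversampling.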

In [None]:
# Rebalance the training split only; the evaluation split keeps its natural class ratio.
# random_state makes the synthetic samples, and therefore the downstream scores,
# reproducible across runs.
sm = SMOTEENN(random_state=42)
X_resampled, y_resampled = sm.fit_resample(x_train, y_train)

### Modelling 

The following models would be used :

* Decision Tree Classifier 
* Random Forest
* Gradient Boosting Classifier
* Logistic Regression

#### 1. Decision Tree Classifier - Without balancing

In [None]:
# Shallow, regularised decision tree for the imbalanced training split.
tree_clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the original (unbalanced) training split.
tree_clf.fit(X=x_train, y=y_train)

##### Decision Tree Predictions

In [None]:
# Predicted churn labels for the held-out evaluation split.
tree_clf_pred = tree_clf.predict(X=x_eval)

##### Decision Tree Classification Report

In [None]:
# Per-class precision / recall / F1 for the unbalanced decision tree.
print(classification_report(y_true=y_eval, y_pred=tree_clf_pred, labels=[0, 1]))

As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model..

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.




##### Decision Tree  Classifier with balancing

In [None]:
# Same tree configuration as the unbalanced run, to be trained on the resampled data.
tree_clf_bal = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the resampled training data, then predict the untouched evaluation split.
tree_bal_pred = tree_clf_bal.fit(X_resampled, y_resampled).predict(x_eval)

print(metrics.classification_report(y_true=y_eval, y_pred=tree_bal_pred))

In [None]:
# Confusion matrix of the balanced decision tree on the evaluation split.
print(metrics.confusion_matrix(y_true=y_eval, y_pred=tree_bal_pred))

Now we can see quite better results, i.e. Accuracy: 62%, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

#### 2. Random Forest Classifier - Without Balancing

In [None]:
# Random forest of 100 shallow, regularised trees for the imbalanced training split.
forest_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the original (unbalanced) training split.
forest_clf.fit(X=x_train, y=y_train)

In [None]:
# Predicted churn labels for the held-out evaluation split.
forest_clf_pred = forest_clf.predict(X=x_eval)

In [None]:
# Per-class precision / recall / F1 for the unbalanced random forest.
print(classification_report(y_true=y_eval, y_pred=forest_clf_pred, labels=[0, 1]))

##### Comment:

Good enough, however lets check with the balance dataset

#### 2. Random Forest Classifier - With Balancing

In [None]:
# Same forest configuration as the unbalanced run, to be trained on the resampled data.
forest_clf_bal = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the resampled (balanced) training data.
forest_clf_bal.fit(X=X_resampled, y=y_resampled)

In [None]:
# Predictions are made on the untouched evaluation split.
forest_bal_pred = forest_clf_bal.predict(X=x_eval)

In [None]:
# Per-class precision / recall / F1 for the balanced random forest.
print(metrics.classification_report(y_true=y_eval, y_pred=forest_bal_pred))

In [None]:
# Confusion matrix of the balanced random forest on the evaluation split.
print(metrics.confusion_matrix(y_true=y_eval, y_pred=forest_bal_pred))

###### Comment:
 After balancing, the f1 score has marginally improved at the expense of accuracy. This is because, prior to balancing, the machine was more biased towards the majority class

#### 3. Gradient Boosting Classifier Model _without balancing

In [None]:
# Gradient boosting with depth-6 trees for the imbalanced training split.
gb_clf = GradientBoostingClassifier(
    criterion='friedman_mse',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the original (unbalanced) training split.
gb_clf.fit(X=x_train, y=y_train)

In [None]:
# Predicted churn labels for the held-out evaluation split.
gb_clf_pred = gb_clf.predict(X=x_eval)

In [None]:
# Per-class precision / recall / F1 for the unbalanced gradient boosting model.
print(classification_report(y_true=y_eval, y_pred=gb_clf_pred, labels=[0, 1]))

#### Gradient Boosting Classifier with balancing

In [None]:
# Same boosting configuration as the unbalanced run, to be trained on the resampled data.
gb_clf_bal = GradientBoostingClassifier(
    criterion='friedman_mse',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Train on the resampled (balanced) training data.
gb_clf_bal.fit(X=X_resampled, y=y_resampled)

In [None]:
# Predictions are made on the untouched evaluation split.
gb_bal_pred = gb_clf_bal.predict(X=x_eval)

In [None]:
# Confusion matrix of the balanced gradient boosting model on the evaluation split.
print(metrics.confusion_matrix(y_true=y_eval, y_pred=gb_bal_pred))

In [None]:
# Per-class precision / recall / F1 for the balanced gradient boosting model.
print(metrics.classification_report(y_true=y_eval, y_pred=gb_bal_pred))

##### Comment:

Yes, a far better result with the Gradient Boosting Model on the balanced dataset.. we can still check for more classifiers

#### 4. Logistic Regression - without balancing

In [None]:
# Logistic regression with the default regularisation strength for the imbalanced split.
lr = LogisticRegression(
    C=1.0,
    random_state=101,
)

In [None]:
# Train on the original (unbalanced) training split.
lr.fit(X=x_train, y=y_train)

In [None]:
# Predicted churn labels for the held-out evaluation split.
lr_pred = lr.predict(X=x_eval)

In [None]:
# Per-class precision / recall / F1 for the unbalanced logistic regression.
print(classification_report(y_true=y_eval, y_pred=lr_pred, labels=[0, 1]))

#### Logistic Regression - with balancing

In [None]:
# Same logistic regression configuration, to be trained on the resampled data.
lr_bal = LogisticRegression(
    C=1.0,
    random_state=101,
)

In [None]:
# Train on the resampled (balanced) training data.
lr_bal.fit(X=X_resampled, y=y_resampled)

In [None]:
# Predictions are made on the untouched evaluation split.
lr_bal_pred = lr_bal.predict(X=x_eval)

In [None]:
# Confusion matrix of the balanced logistic regression on the evaluation split.
print(metrics.confusion_matrix(y_true=y_eval, y_pred=lr_bal_pred))

In [None]:
# Per-class precision / recall / F1 for the balanced logistic regression.
print(metrics.classification_report(y_true=y_eval, y_pred=lr_bal_pred))

##### Comment
We can see this is also far better: we now know that the balanced data gives better results.

 ### Model Evaluation

In [None]:
metrics_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

In [None]:
# Baseline comparison: every candidate is fitted on the original (unbalanced)
# training split and scored on the held-out evaluation split.
models = [
    (DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8), 'Decision Tree'),
    (RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8), 'Random Forest Classifier'),
    (GradientBoostingClassifier(criterion='friedman_mse', random_state=100, max_depth=6, min_samples_leaf=8), 'Gradient Boosting Classifier'),
    (LogisticRegression(C=1.0, random_state=100), 'Logistic Regression Model')
]

metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# Collect one record per model and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# (and re-sorting it on every iteration) is unnecessary work.
metric_rows = []
for model, model_name in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_eval)

    metric_rows.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_eval, y_pred),
        'Precision': precision_score(y_eval, y_pred),
        'Recall': recall_score(y_eval, y_pred),
        'F1 Score': f1_score(y_eval, y_pred),
    })

# Best F1 score first; the index is renumbered so it matches the ranking.
metrics_df = (
    pd.DataFrame(metric_rows, columns=metric_columns)
    .sort_values(by='F1 Score', ascending=False)
    .reset_index(drop=True)
)

# Print the final metrics dataframe
print(metrics_df)


In [None]:
metrics_df

In [None]:

# Balanced comparison: the same candidates, fitted on SMOTEENN-resampled training
# data and scored on the untouched evaluation split.
models = [
    (DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8), 'Decision Tree'),
    (RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8), 'Random Forest Classifier'),
    (GradientBoostingClassifier(criterion='friedman_mse', random_state=100, max_depth=6, min_samples_leaf=8), 'Gradient Boosting Classifier'),
    (LogisticRegression(C=1.0, random_state=100), 'Logistic Regression Model')
]

metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# Resample the training split only. random_state makes the resampled set, and
# therefore the scores below and the later hyperparameter search, reproducible.
sm = SMOTEENN(random_state=42)
X_resampled, y_resampled = sm.fit_resample(x_train, y_train)

# Collect one record per model and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0).
metric_rows = []
for model, model_name in models:
    model.fit(X_resampled, y_resampled)
    yr_predict = model.predict(x_eval)

    metric_rows.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_eval, yr_predict),
        'Precision': precision_score(y_eval, yr_predict),
        'Recall': recall_score(y_eval, yr_predict),
        'F1 Score': f1_score(y_eval, yr_predict),
    })

# Sort the dataframe in descending order based on F1 Score
metrics_bal_df = (
    pd.DataFrame(metric_rows, columns=metric_columns)
    .sort_values(by='F1 Score', ascending=False)
    .reset_index(drop=True)
)

# Print the final metrics dataframe
print(metrics_bal_df)


In [None]:
 metrics_bal_df

##### Comment:

From the two tables, it can be observed that:
* Balancing the target variables improved the performance of our models
* The two best performing models based on the F1 score are Gradient Boosting Classifier and Logistic Regression Classifier.

### Hyperparameter Tuning

In [None]:
models = [ gb_clf_bal, lr_bal]

#### Tuning Gradient Boosting Classifier Model

In [None]:
models[0].get_params()

In [None]:
# Search space for the gradient boosting classifier: every combination of these
# values is evaluated by the grid search.
gb_clf_params = dict(
    ccp_alpha=[0.0, 0.1, 0.2],
    max_depth=[4, 6, 8],
    min_samples_leaf=[8, 10, 12],
    n_estimators=[100, 1000],
)

In [None]:
# Metrics recorded for every grid point; the best estimator is refitted on the
# one named in `refit`.
scoring_metrics = ['accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall', 'roc_auc']

# 5-fold grid search over the gradient boosting search space.
searcher = GridSearchCV(
    gb_clf_bal,
    gb_clf_params,
    scoring=scoring_metrics,
    refit='balanced_accuracy',
    cv=5,
    verbose=3,
)

In [None]:
searcher.fit(X_resampled,y_resampled)

In [None]:
gb_clf_tuned = searcher.best_estimator_

In [None]:
gb_clf_tuned

In [None]:
models.append(gb_clf_tuned)

In [None]:
models

#### Tuning the Logistic Regression Model

In [None]:
models[1].get_params()

In [None]:
# Search space for the logistic regression model.
# intercept_scaling is intentionally not searched: scikit-learn only uses it with the
# 'liblinear' solver, and lr_bal is created with the default solver, so searching over
# it tripled the number of fits without changing any score.
lr_params = {
    'C': [1.0, 2.0, 3.0],
    'max_iter': [1000, 10000, 100000],
}

In [None]:
# Metrics recorded for every grid point; the best estimator is refitted on the
# one named in `refit`.
scoring_metrics = ['accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall', 'roc_auc']

# 5-fold grid search over the logistic regression search space.
searcher_lr = GridSearchCV(
    lr_bal,
    lr_params,
    scoring=scoring_metrics,
    refit='balanced_accuracy',
    cv=5,
    verbose=3,
)

In [None]:
searcher_lr.fit(X_resampled, y_resampled)

In [None]:
lr_tuned = searcher_lr.best_estimator_

In [None]:
models.append(lr_tuned)

In [None]:
models

### Exporting Key Components

In [None]:
# Bundle everything needed at inference time: the fitted scaler and the list of models.
components = dict(scaler=scaler, model=models)

In [None]:
components

In [None]:
#Create Folder
# os.makedirs with exist_ok=True does nothing when the folder already exists, so the
# cell can be re-run safely and does not depend on the shell's `mkdir` command.
os.makedirs('export', exist_ok=True)

In [None]:
# Path of the export folder, relative to the current working directory.
destination = os.path.join(os.curdir, 'export')

In [None]:
# Serialise the bundled components into the export folder.
export_path = os.path.join(destination, 'ml.pkl')
with open(export_path, mode='wb') as export_file:
    pickle.dump(components, export_file)

In [None]:
#requirements
!pip freeze requirement.txt

In [None]:
#save file in export
# Writes the installed package list next to the pickled components; the `export`
# folder must already exist for the redirect to succeed.
!pip freeze > export/requirement.txt