# Project Folder Structure

Project Folder Structure

|->`Drive`/VisitWithUs-Tourism

  |->Master
  
    |->Data
        |->tourism.csv
        |->train.csv
        |->test.csv
    |->Deployment
        |->app.py
        |->Dockerfile
        |->README.md
        |->requirements.txt
    |->Model_Building
        |->BuildingModels.py
        |->DataPrepration.py
        |->DataRegistration.py
    |->Model_Dump_JOBLIB
        |->best_threshold.txt
        |->BestModel_XGBoostingClassifier.joblib
    |->pipeline.yml
  |->AIML_MLOPS_v1_1.ipynb

# Loading Packages

In [None]:
!pip install huggingface_hub
!pip install python-dotenv
!pip install datasets
!pip install pandas
!pip install scikit-learn
!pip install xgboost
!pip install seaborn
!pip install matplotlib
!pip install joblib
!pip install stramlit

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1
[31mERROR: Could not find a version that satisfies the requirement stramlit (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for stramlit[0m[31m
[0m

# MOUNTING DRIVE

In [65]:
import os
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project/'
# base_path = 'VisitWithUs-Tourism_version_1_1/Master/'
# print(f"Base Path {base_path}")

# data_path = os.path.join(base_path, 'Data')
# model_joblib_path = os.path.join(base_path, 'Model_Dump_JOBLIB')

# print("checking for Program folder created or not")
# if not os.path.exists(base_path):
#   os.makedirs(base_path, exist_ok=True)
# else:
#   print(f"folder already exists {base_path}")

# if not os.path.exists(data_path):
#   os.makedirs(data_path, exist_ok=True)
# print(f"folder already exists {data_path}")

# if not os.path.exists(model_joblib_path):
#   os.makedirs(model_joblib_path, exist_ok=True)
# print(f"folder already exists {model_joblib_path}")


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project


In [None]:
!ls

 Reference  'version 0'  'version 0.1'	 VisitWithUs-Tourism_version_1_1


# 1. DATA REGISTRATION

In [None]:
# @title DataRegistration Class
#%%writefile $base_path/DataRegistration.py
import os
import traceback
import inspect
from huggingface_hub import HfApi, create_repo,login,hf_hub_download

class DataRegistration:
  def __init__(self,base_path,hf_token=None):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    self.repoID = 'jpkarthikeyan/Tourism-visit-with-us-dataset'
    self.Subfolders = os.path.join(base_path,'Data')
    self.folder_Master = base_path
    self.folder_data = os.path.join(base_path,"Data")
    self.hf_token = hf_token
    os.makedirs(self.folder_data, exist_ok=True)
    print(f"self.Subfolders: {self.Subfolders}")
    print(f"self.folder_Master: {self.folder_Master}")
    print(f"folder_data: {self.folder_data}")

  def HFCreateRepo(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      create_repo(repo_id=self.repoID,
                  private=False,
                  repo_type='dataset',
                  exist_ok=True)
      print(f"Repo {self.repoID} created")
      return True

    except Exception as ex:
      if hasattr(ex,'response') and ex.response.status_code == 409:
        print(f"Repo {self.repoID} already exists")
        return True
      else:
        print(f"Exception {ex}")
        traceback.print_exc()
        return False
    finally:
      print("-"*100)


  def UploadingSourceData(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      source_data_file = os.path.join(self.folder_data,'tourism.csv')
      print(f"Soruce Data File {source_data_file}")
      if not os.path.exists(source_data_file):
        raise FileNotFoundError(f"File {source_data_file} not found")
      api = HfApi()
      api.upload_file(
          path_or_fileobj = source_data_file,
          path_in_repo = 'Master/Data/tourism.csv',
          repo_id = self.repoID,
          repo_type='dataset',
          token=self.hf_token)
      print(f"Source data tourism.csv uploaded into {self.repoID}")
      return True

    except Exception as ex:
       print(f"Exception at {inspect.currentframe().f_code.co_name} Exception: {ex}")
       traceback.print_exc()
       return False
    finally:
      print("-"*100)

  def ToRunPipeline(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    if not self.HFCreateRepo():
      print('Exception in data registration HFCreateRepo')
      return False
    else:
      if not self.UploadingSourceData():
        print('Exception in data registration UploadingSourceData')
        return False
      else:
        print('Data Registration Completed')
        return True

Overwriting VisitWithUs-Tourism_version_1_1/Master//DataRegistration.py


# 2. Data Prepration

In [None]:
#%%writefile $base_path/DataPrepration.py
import os
import pandas as pd
import inspect
import traceback
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi, create_repo, login, hf_hub_download

class DataPrepration:
  def __init__(self,base_path, hf_token=None):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    self.repoID = 'jpkarthikeyan/Tourism-visit-with-us-dataset'
    self.Subfolders = os.path.join(base_path, 'Data')
    self.hf_token = hf_token

  def LoadDatasetFromHF(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      df_dataset = pd.read_csv(hf_hub_download(
                                repo_id = self.repoID,
                                filename = 'Master/Data/tourism.csv',
                                repo_type='dataset'
                              ))

      print(f'Shape of the original dataset {df_dataset.shape}')

      if 'Unnamed: 0' in df_dataset.columns:
        df_dataset = df_dataset.drop(['Unnamed: 0'],axis=1)

      print(f"Dataset loaded from {self.repoID}/{self.Subfolders}")
      print(f"Shape of the Original Dataset: {df_dataset.shape}")
      return df_dataset
    except Exception as ex:
      print(f"Exception {ex}")
      traceback.print_exc()
      return None
    finally:
      print('-'*50)

  def TrainTestSplit(self,df_dataset):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      print(f"Value Count {df_dataset['ProdTaken'].value_counts()}")

      df_train,df_test = train_test_split(df_dataset,
                                          test_size=0.2,
                                          random_state=42,
                                          stratify=df_dataset['ProdTaken'],
                                          shuffle=True)

      print(f"Shape of the train dataset: {df_train.shape}")
      print(f"Shape of the test dataset: {df_test.shape}")

      return df_train, df_test
    except Exception as ex:
      print(f'Exception: {ex}')
      print(traceback.print_exc())
      return None, None
    finally:
      print('-'*50)

  def DatasetCleaning(self,df_data):
    try:
      print(f"Function Name {inspect.currentframe().f_code.co_name}")
      df_data['Gender'] = df_data['Gender'].replace('Fe Male', 'Female')

      df_data = df_data.drop_duplicates(subset=['CustomerID'], keep='first').reset_index(drop=True)

      for clmn in df_data.columns:
        if df_data[clmn].dtype in ['int64']:
          #print(f"{clmn} replacing the missing value with median")
          df_data[clmn] = df_data[clmn].fillna(df_data[clmn].median())
        else:
          #print(f"{clmn} replacing the missing value with mode")
          df_data[clmn] = df_data[clmn].fillna(df_data[clmn].mode()[0])

      df_data = df_data.drop(['CustomerID'], axis=1)

      numerical_column = df_data.select_dtypes(include=['int64'])

      for num_col in numerical_column:
        Q1 = df_data[num_col].quantile(0.25)
        Q3 = df_data[num_col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5*IQR
        upper = Q3 + 1.5*IQR
        #df_data[num_col] = df_data[num_col].clip(lower,upper)

      return df_data

    except Exception as ex:
      print(f"Exception {ex}")
      print(traceback.print_exc())
      return None
    finally:
      print('-'*50)

  def UploadIntoHF(self,df,drive_path,file_name):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      file_path = os.path.join(drive_path,file_name)
      df.to_csv(file_path,index=False)

      api = HfApi(token = self.hf_token)
      api.upload_file(path_or_fileobj =file_path,
                      path_in_repo= f"Master/Data/{file_name}",
                      repo_id = self.repoID,
                      repo_type='dataset',
                      token=self.hf_token)
      print(f"Source data {file_name} uploaded into {self.repoID}")
      return True
    except Exception as ex:
      print(f"Exception: {ex}")
      traceback.print_exc()
      return False
    finally:
      print('-'*50)

  def ToRunPipeline(self):
    try:
      print(f"Function Name {inspect.currentframe().f_code.co_name}")
      df_dataset = self.LoadDatasetFromHF()
      if df_dataset is None:
        return False
      else:
        df_train, df_test = self.TrainTestSplit(df_dataset)
        if df_train is None or df_test is None:
          return False
        else:
          df_train_cleaned = self.DatasetCleaning(df_train)
          df_test_cleaned = self.DatasetCleaning(df_test)
          if df_train is None or df_test is None:
            return False
          else:
            result_train = self.UploadIntoHF(df_train_cleaned,
                                             self.Subfolders,'train.csv')
            result_test = self.UploadIntoHF(df_test_cleaned,
                                            self.subfolders,'test.csv')
            if not result_train or not result_test:
              print('Splitted dataset upload into HF Exception')
              return False
            else:
              print('Dataset downloaded from HF, Cleaned, Splitted into train and test dataset and uploaded back into HF dataset')
              return True
    except Exception as ex:
      print(f"Exception message in ToRunPipeline: {ex}")
      traceback.print_exc()
      return False
    finally:
      print('-'*50)

# 3.Model Building with Experimentation Tracking

In [None]:
#%%writefile $base_path/BuildingModels.py

import os
import joblib
import inspect
import traceback
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from huggingface_hub.utils import RepositoryNotFoundError
from huggingface_hub import HfApi, create_repo, login
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, precision_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder


class BuildingModels:
  def __init__(self,base_path, hf_token=None):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    self.models = {}
    self.best_model = None
    self.best_score = 0
    self.best_f1_score =0.0
    self.best_model_threshold = 0.0
    self.best_model_name=None
    self.df_train = pd.DataFrame()
    self.df_test = pd.DataFrame()
    self.feature_train = pd.DataFrame()
    self.feature_test = pd.DataFrame()
    self.target_train = pd.Series()
    self.target_test = pd.Series()
    self.base_path = base_path
    self.Subfolders = os.path.join(base_path,'data')
    self.repo_id = 'jpkarthikeyan/Tourism_Prediction_Model'
    self.ds_repo_id = 'jpkarthikeyan/Tourism-visit-with-us-dataset'
    self.repo_type = 'model'
    self.hf_token = hf_token
    self.categorical_columns = ['TypeofContact','Occupation','Gender','ProductPitched','MaritalStatus','Designation']
    self.numerical_columns = ['Age','CityTier','DurationOfPitch','NumberOfPersonVisiting',
                              'NumberOfFollowups','PreferredPropertyStar',
                              'NumberOfTrips','Passport','PitchSatisfactionScore','OwnCar',
                              'NumberOfChildrenVisiting','MonthlyIncome']

    self.pipeline_numerical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    self.pipeline_onehot = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False))
    ])

  def Load_data_from_HF(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      print(f'Loading the train dataset from {self.ds_repo_id}')

      self.df_train = pd.read_csv(hf_hub_download(
                                repo_id = self.ds_repo_id,
                                filename = 'Master/Data/train.csv',repo_type='dataset'))
      self.df_test = pd.read_csv(hf_hub_download(
                                repo_id = self.ds_repo_id,
                                filename = 'Master/Data/test.csv',repo_type='dataset'))
      print(f"Shape of the train dataset: {self.df_train.shape}")
      print(f"Shape of the train dataset: {self.df_test.shape}")

      return True
    except Exception as ex:
      print(f"Exception: {ex}")
      traceback.print_exc()
      return False
    finally:
      print('-'*50)

  def Preprocessing_dataset(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:

      self.target_train = self.df_train['ProdTaken']
      self.feature_train = self.df_train.drop(['ProdTaken'],axis=1)

      self.target_test = self.df_test['ProdTaken']
      self.feature_test = self.df_test.drop(['ProdTaken'],axis=1)

      return True

    except Exception as ex:
      print(f"Exception: {ex}")
      traceback.print_exc()
      return False
    finally:
      print('-'*50)

  def Building_Models(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      preprocessor = ColumnTransformer(
          transformers=[
              ('num', self.pipeline_numerical,self.numerical_columns),
              ('onehot', OneHotEncoder(drop='first',handle_unknown='ignore',
                        sparse_output=False),self.categorical_columns)])
      models_params = {
          'DecisionTreeClassifier':{
              'model': DecisionTreeClassifier(class_weight='balanced',random_state=42),
              'params': {'classifier__criterion':['gini','entropy'],
                         'classifier__splitter':['best','random'],
                        'classifier__max_depth':[1],
                         'classifier__min_samples_leaf':[1,2,4],
                         'classifier__min_samples_split':[2,5,10],
                         'classifier__max_features':['sqrt','log2',None]}
          },

          'RandomForestClassifier':{
              'model': RandomForestClassifier(class_weight='balanced',random_state=42),
              'params': { 'classifier__n_estimators':[25,50,75,100],
                          'classifier__criterion':['gini','entropy'],
                          'classifier__max_depth':[5,10,15],
                          'classifier__min_samples_split':[15,20,25],
                          'classifier__min_samples_leaf':[7,10,15],
                          'classifier__max_features':[0.3,0.5,0.6],
                          'classifier__oob_score':[True],
                          'classifier__bootstrap':[True]
                         }
          },

          'BaggingClassifier':{
              'model': BaggingClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',random_state=42)),
              'params':{  'classifier__n_estimators':[10,50,75,100],
                          'classifier__max_samples':[0.3,0.5,0.7,0.9],
                          'classifier__max_features':[0.3,0.5,0.7],
                          'classifier__oob_score':[True],
                          'classifier__estimator__criterion':['gini','entropy'],
                          'classifier__estimator__max_depth':[5,7,9],
                          'classifier__estimator__min_samples_split':[8,10,12],
                          'classifier__estimator__min_samples_leaf':[2,3,5]
                        }
          },

          'AdaBoostingClassifier':{
              'model': AdaBoostClassifier(random_state=42),
              'params':{  'classifier__n_estimators':[50,75,100],
                          'classifier__learning_rate':[0.01,0.05,0.1],
                          'classifier__algorithm':['SAMME','SAMME.R']

                      }
          },

          'GradientBoostingClassifier':{
              'model': GradientBoostingClassifier(random_state=42),
              'params':{
                          'classifier__n_estimators':[50,75,100,125],
                          'classifier__learning_rate':[0.01,0.5,0.1],
                          'classifier__criterion':['friedman_mse','squared_error'],
                          'classifier__max_features':['sqrt','log2'],
                          'classifier__min_samples_leaf':[1,2,4],
                          'classifier__subsample':[0.6,0.7,0.8],
                          'classifier__max_depth':[2,3,4,5]
                        }
          },

          'XGBoostingClassifier':{
              'model':XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
              'params':{'classifier__n_estimators':np.arange(50,100,10),
                        'classifier__max_depth': [3,5,7],
                        'classifier__learning_rate':[0.01,0.1,0.2],
                        'classifier__subsample':[0.6,0.8,1.0],
                        'classifier__colsample_bytree':[0.6,0.8,1.0],
                        'classifier__gamma':[0,1,2],
                        'classifier__reg_alpha':[0,1,2]

                        }
          }

        }

      cv_KFold = KFold(n_splits=3,random_state=42,shuffle=True)

      for model_name, mdl_params in models_params.items():
        print(f'Model {model_name} started')
        pipeline = Pipeline(steps=[
            ('preprocessor',preprocessor),
            ('classifier',mdl_params['model'])
        ])
        random_search = RandomizedSearchCV(pipeline,mdl_params['params'],
                                           n_iter=50,cv=cv_KFold,scoring='f1',
                                           random_state=42,n_jobs=-1,verbose=2)

        random_search.fit(self.feature_train,self.target_train)

        self.models[model_name] = {
            'model':random_search.best_estimator_,
            'best_score': random_search.best_score_,
            'best_params':random_search.best_params_
        }
        joblib.dump(random_search.best_estimator_,f'{self.base_path}/Model_Dump_JOBLIB/{model_name}.joblib')
        print(f'model:{random_search.best_estimator_}')
        print(f'best_score: {random_search.best_score_}')
        print(f'best_params: {random_search.best_params_}')
        print(f'Modle {model_name} completed')
        print('-'*50)

      return self.models
    except Exception as ex:
      print(f"Exception: {ex}")
    finally:
      print('-'*50)

  def Model_Evaluation(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    df_metrics = pd.DataFrame()
    try:
      for mdl_name, mdl_info in self.models.items():
        model = mdl_info['model']
        predict_proability = model.predict_proba(self.feature_test)
        print(f"Predict proability shape {mdl_name} {predict_proability.shape}")
        if predict_proability.shape[1] ==1:
          predict_proability = predict_proability.flatten()
        else:
          predict_proability = predict_proability[:,1]


        prc_precision,prc_recall, prc_threshold = precision_recall_curve(self.target_test,predict_proability)
        prc_f1score = 2*((prc_precision*prc_recall) / (prc_precision+prc_recall+1e-10))

        prc_threshold_idmx = np.argmax(prc_f1score)
        prc_best_threshold = prc_threshold[prc_threshold_idmx]
        print(f'best threshold: {prc_best_threshold}')

        predic_prob_threshold = (predict_proability >= prc_best_threshold).astype(int)
        #predic_prob_threshold = (predict_proability >= 0.5).astype(int)
        accuracy = accuracy_score(self.target_test,predic_prob_threshold)
        precision = precision_score(self.target_test,predic_prob_threshold)
        recall = recall_score(self.target_test,predic_prob_threshold)
        f1score = f1_score(self.target_test,predic_prob_threshold)
        class_report = classification_report(self.target_test,predic_prob_threshold)
        conf_matrix = confusion_matrix(self.target_test,predic_prob_threshold)

        lbl = ['TN', 'FP', 'FN', 'TP']
        cnf_lbl = ['\n{0:0.0f}'.format(cnf_val) for cnf_val in conf_matrix.flatten()]
        cn_percentage = ["\n{0:.2%}".format(item/conf_matrix.flatten().sum()) for item in conf_matrix.flatten()]

        confusion_label = np.asarray([["\n {0:0.0f}".format(item)+"\n{0:.2%}".format(item/conf_matrix.flatten().sum())]
                                for item in conf_matrix.flatten()]).reshape(2,2)

        cnf_label = np.asarray([f'{lbl1} {lbl2} {lbl3}' for lbl1, lbl2, lbl3 in zip(lbl, cnf_lbl,  cn_percentage)]).reshape(2,2)

        plt.figure(figsize = (3,3))
        sns.heatmap(conf_matrix, annot = cnf_label, cmap = 'Spectral', fmt='' )
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'{mdl_name} confusion matrix')
        plt.tight_layout()
        plt.show()

        df_metrics = pd.concat([df_metrics,pd.DataFrame({'model':[mdl_name],'accuracy':[accuracy],
                                            'precision':[precision], 'recall':[recall],
                                            'f1_score':[f1score]})],ignore_index=True)

        if f1score > self.best_f1_score:
          self.best_f1_score = f1score
          self.best_model_threshold = prc_best_threshold
          self.best_model_name = mdl_name

      best_model = self.models[self.best_model_name]['model']
      if hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature':self.feature_train.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance',ascending=False)
        print('Feature Importance:\n',feature_importance)


      return df_metrics

    except Exception as ex:
      print(f"Exception: {ex}")
    finally:
      print('-'*50)

  def Register_BestModel_HF(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      best_model = self.models[self.best_model_name]['model']
      joblib.dump(best_model,f'{self.base_path}/Model_Dump_JOBLIB/BestModel_{self.best_model_name}.joblib')


      api = HfApi()
      try:
        api.repo_info(repo_id=self.repo_id,repo_type=self.repo_type)
      except RepositoryNotFoundError:
        api.create_repo(repo_id=self.repo_id, repo_type=self.repo_type,private=False)


      print("Uploading the best model into Hugging face")
      api.upload_file(path_or_fileobj = f'{self.base_path}/Model_Dump_JOBLIB/BestModel_{self.best_model_name}.joblib',
                      path_in_repo = f"Model_Dump_JOBLIB/BestModel_{self.best_model_name}.joblib",
                      repo_id=self.repo_id, repo_type=self.repo_type
                      )


      print("Uploading the best threshold text file to HF")
      with open('Master/Model_Dump_JOBLIB/best_threshold.txt','w') as f:
        f.write(str(self.best_model_threshold))
      api.upload_file(path_or_fileobj = f"{self.base_path}/Model_Dump_JOBLIB/best_threshold.txt",
                      path_in_repo = f"Model_Dump_JOBLIB/best_threshold.txt",
                      repo_id=self.repo_id, repo_type=self.repo_type
                      )

      return True


    except Exception as ex:
      print(f"Exception: {ex}")
      traceback.print_exc()
      return False
    finally:
      print('-'*50)

  def ToRunPipeline(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    try:
      if not self.Load_data_from_HF():
        return False
      else:
        if not self.Preprocessing_dataset():
          return False
        else:
          Build_Model = self.Building_Models()
          print(Build_Model)
          if Build_Model:
            df_Metrics = self.Model_Evaluation()
            print(df_Metrics)
            if not df_Metrics.empty:
              if self.Register_BestModel_HF():
                return True
              else:
                return False
            else:
              return False
          else:
            return False
    except Exception as ex:
      print(f'Exception occured {ex}')
    finally:
      print('-'*50)


# Main Function

In [68]:
import os
import logging
from google.colab import drive
from google.colab import userdata

def main():

  drive.mount('/content/drive/')
  %cd '/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project/VisitWithUs-Tourism_version_1_1/'
  base_path = 'Master/'
  print(f"Base Path {base_path}")

  data_path = os.path.join(base_path, 'Data')
  model_joblib_path = os.path.join(base_path, 'Model_Dump_JOBLIB')

  print("checking for Program folder created or not")
  if not os.path.exists(base_path):
    os.makedirs(base_path, exist_ok=True)
  else:
    print(f"folder already exists {base_path}")

  if not os.path.exists(data_path):
    os.makedirs(data_path, exist_ok=True)
  print(f"folder already exists {data_path}")

  if not os.path.exists(model_joblib_path):
    os.makedirs(model_joblib_path, exist_ok=True)
  print(f"folder already exists {model_joblib_path}")

  try:
    hf_token = userdata.get('HF_Token')
  except Exception as ex:
    print(f"Exception: {ex}")
    raise ValueError(f"HF TOKEN not found in the colab secrets")

  # DATA REGISTRATION
  try:
    ObjDataReg = DataRegistration(base_path,hf_token)
    if not ObjDataReg.ToRunPipeline():
      print("Exception Data Registration")
      return False
    else:
      print("Data Registration Completed")
      return True
  except Exception as ex:
    logging.error(f"Exception {ex}")

  # DATA PREPRATION
  try:
    ObjDataPrep = DataPrepration(base_path,hf_token)
    if not ObjDataPrep.ToRunPipeline():
      logging.error("Data Registration failed")
      return False
    else:
      logging.info("Data Registration Completed")
      return True
  except Exception as ex:
    logging.error(f"Exception {ex}")
    return False


  # MODEL BUILDING
  try:
    ObjMdlBuild = BuildingModels(base_path,hf_token)
    if not ObjMdlBuild.ToRunPipeline():
      print("Exception Model Building")
      return False
    else:
      print("Model Building Completed")
      return True
  except Exception as ex:
    logging.error(f"Exception {ex}")
    return False



if __name__ == "__main__":
  main()



Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project/VisitWithUs-Tourism_version_1_1
Base Path Master/
checking for Program folder created or not
folder already exists Master/
folder already exists Master/Data
folder already exists Master/Model_Dump_JOBLIB


ERROR:root:Exception name 'DataRegistration' is not defined
ERROR:root:Exception name 'DataPrepration' is not defined


In [69]:
!pwd

/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project/VisitWithUs-Tourism_version_1_1


In [70]:
!ls

Master


In [71]:
%%writefile .github/workflows/pipeline.yml
name: VISIT WITH US TOURISM PREDICTION PIPELINE

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  data_registration:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout Repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.12'

    - name: Install Dependencies
      run: |
        python -m pip install --upgrade pip
        pip install jupyter nbconvert huggingface_hub pandas

    - name: SET UP FOLDER STRUCTURE
      run: |
        mkdir -p VisitWithUs-Tourism_version_1_1/Master/Data
        mkdir -p VisitWithUs-Tourism_version_1_1/Master/Model_Dump_JOBLIB
        if [ -f "tourism.csv"]; then
          cp tourism.csv VisitWithUs-Tourism_version_1_1/Master/Data/tourism.csv
        else
          echo "Error: tourism.csv not found"
          exit 1
        fi

    - name: CONVERT NOTEBOOK TO PYTHON SCRIPT
      run: |
        jupyter nbconvert --to python Visit-With-Us-Tourism-Prediction_v1_1.ipynb --output pipeline_script

    - name: RUN PIPELINE SCRIPT FOR DATA REGISTRATION
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        echo "import logging" > run_data_registration.py
        echo "logging.basicConfig(level=logging.INFO, format='%(asctime)s -%(levelname)s -%(message)s' , filename ='data_registration.log')" >> run_data_registration.py
        echo "logging.info('Starting DataRegistation')" >> run_data_registration.py
        echo "from DataRegistation import DataRegistration" >> run_data_registration.py
        echo "base_path = 'VisitWithUs-Tourism_version_1_1/Master/'" >> run_data_registration.py
        echo "dr = DataRegistration(base_path=base_path,hf_token='$HF_TOKEN')" >> run_data_registration.py
        echo "dr.ToRunPipeline()" >> run_data_registration.py
        echo "logging.info('DataRegistration failed')" >> run_data_registration.py
        echo "exit 1" >> run_data_registration.py
        python run_data_registration.py

    - name: Upload logs and script
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: data-registration-outputs
        path: |
          data_registration.log
          run_data_registration.py

Writing .github/workflows/pipeline.yml


#Front End Implementation

In [None]:
pip install streamlit

In [None]:
%%writefile Master/Deployment/requirements.txt
pandas
numpy
scikit-learn
joblib
streamlit
huggingface_hub
xgboost

In [None]:
%%writefile Master/Deployment/README.md
---
title: Visit With Us - Tourism package prediction
emoji: 🚩
colorFrom: blue
colorTo: green
sdk: docker
sdk_version: 3.9
app_file: app.py
app_type: streamlit
pinned: false
license: mit
---
The streamlit app predicts the customer will purchace the tourism package

In [None]:
%%writefile Master/Deployment/Dockerfile
# Use a minimal base image with Python 3.9 installed
FROM python:3.9-slim

# Set the working directory inside the container to /app
WORKDIR /app

# Copy all files from the current directory on the host to the container's /app directory
COPY . .

# Install Python dependencies listed in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
RUN mkdir -p /tmp/hf_cache && chmod -R 777 /tmp/hf_cache
ENV HF_HOME=/tmp/hf_cache
ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache
ENV PYTHONUNBUFFERED=1


EXPOSE 7860


# Define the command to run the Streamlit app on port "7860" and make it accessible externally
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

In [None]:
%%writefile Master/Deployment/app.py
import streamlit as st
import pandas as pd
import joblib
import os
import logging
from huggingface_hub import login,hf_hub_download
from xgboost import XGBClassifier
#from google.colab import userdata

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
cache_dir = "/tmp/hf_cache"
os.environ["HF_HOME"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

try:
  hf_token = os.getenv("HUGGINGFACE_TOKEN")

  if hf_token:
    login(token=hf_token)
    logger.info("Successfully logged in to Hugging Face")
  else:
    logger.error("Hugging face token not found")
    st.error("Huggingface token not found")
except Exception as ex:
  logger.error(f"Failed to login to Hugging face: {ex} ")
  st.write(f"Failed to login to Hugging face: {ex} ")

try:
  os.makedirs(cache_dir, exist_ok=True)
  logger.info(f"Created cache directory {cache_dir}")
except Exception as ex:
  logger.error(f"Failed to create cache directory {cache_dir}: {ex}")
  st.error(f"Failed to create cache directory {cache_dir}: {ex}")


st.title("Visit with Us: Tourism Package Prediction")
st.write("Enter the Customer details to predict the likehood of purchasing the tourism packages")


if 'predictor' not in st.session_state:
  st.session_state.predictor = None
  st.session_state.model_loaded = False

class PredictorTourism:

  def __init__(self):
    self.Subfolders = 'Master'
    self.repoID = 'jpkarthikeyan/Tourism_Prediction_Model'
    self.model = None
    self.best_threshold = 0.0

  def Load_Model(self):
    try:
      logger.info("Loading best model")
      model_path = hf_hub_download(
          repo_id = self.repoID,filename = f'Model_Dump_JOBLIB/BestModel_XGBoostingClassifier.joblib',
          repo_type = 'model')
      threshold_path = hf_hub_download(
          repo_id = self.repoID, filename=f'Model_Dump_JOBLIB/best_threshold.txt',
          repo_type='model')

      logger.info(f"Model path: {model_path}")
      logger.info(f"Threshold path:  {threshold_path}")

      self.model = joblib.load(model_path)
      with open(threshold_path,'r') as f:
        self.best_threshold = float(f.read())
      st.success("Model and threshold loaded successfully")
      return True

    except Exception as ex:
      st.error(f'Exception: {ex}')
      return False


  def Predict(self, data):
    try:
      logger.info(f"Input Data: {data}")
      df= pd.DataFrame([data])
      logger.info(f"Data shape: {df.shape}")
      logger.info(f"Dataframe columns: {df.columns.tolist()}")
      prob = self.model.predict_proba(df)[:,1]
      prediction = int(prob >= self.best_threshold)
      return prediction

    except Exception as ex:
      logger.error(f"Exception in predict: {ex}", exc_info=True)
      st.error(f"Exception Prediction: {ex} {traceback.print_exc}")
      return ex


if not st.session_state.model_loaded:
  st.session_state.predictor = PredictorTourism()
  st.session_state.model_loaded = st.session_state.predictor.Load_Model()

with st.form("customer_form"):
  st.header("Customer Details")
  col1, col2,col3 = st.columns(3)

  with col1:

    age = st.number_input("Age", min_value=18, max_value=100, value=41)
    gender = st.selectbox('Gender',['Male','Female'])
    MaritalStatus = st.selectbox('MaritalStatus',['Married','Unmarried','Single','Divorced'])
    Occupation = st.selectbox('Occupation',['Free Lancer','Salaried','Small Business','Large Business'])
    Designation = st.selectbox('Designation',['AVP','Manager','Executive','Senior Manager','VP'])
    MonthlyIncome = st.number_input('MonthlyIncome',min_value=0, max_value=1000000,value=20999)

  with col2:

    typeofcontact = st.selectbox("TypeofContact",['Self Enquiry','Company Invited'])
    citytier = st.selectbox('citytier',[1,2,3], index=2)
    DurationOfPitch = st.number_input('DurationOfPitch', min_value=1, max_value=60, value=6)
    ProductPitched = st.selectbox('ProductPitched',['Deluxe','Basic','Standard','Super Deluxe','King'])
    PreferredPropertyStar = st.selectbox("'PreferredPropertyStar",[3,2,1])
    NumberOfTrips = st.number_input('NumberOfTrips',min_value=0, max_value=30, value=1)


  with col3:
    NumberOfPersonVisiting = st.number_input('NumberOfPersonVisiting',min_value=1,max_value=10,value=3)
    NumberOfFollowups = st.number_input('NumberOfFollowups',min_value=0,max_value=10, value=3)
    NumberOfChildrenVisiting= st.number_input('NumberOfChildrenVisiting',min_value=0,max_value=5,value=0)
    Passport= st.selectbox('Passport',['Yes','No'],format_func=lambda x:"Yes" if x=="Yes" else "No")
    Owncar= st.selectbox('OwnCar',['Yes','No'],format_func=lambda x:"Yes" if x=="Yes" else "No")
    PitchSatisfactionScore= st.number_input('PitchSatisfactionScore',min_value=1,max_value=5,value=3)


  submitted = st.form_submit_button("Predict")

if submitted:
  input_data = {
      'Age':age,
      'TypeofContact':typeofcontact,
      'CityTier':citytier,
      'DurationOfPitch':DurationOfPitch,
      'Occupation':Occupation,
      'Gender':gender,
      'NumberOfPersonVisiting':NumberOfPersonVisiting,
      'NumberOfFollowups':NumberOfFollowups,
      'ProductPitched':ProductPitched,
      'PreferredPropertyStar':PreferredPropertyStar,
      'MaritalStatus':MaritalStatus,
      'NumberOfTrips':NumberOfTrips,
      'Passport':1 if Passport =="Yes" else 0,
      'OwnCar':1 if Owncar =="Yes" else 0,
      'PitchSatisfactionScore':PitchSatisfactionScore,
      'NumberOfChildrenVisiting':NumberOfChildrenVisiting,
      'Designation':Designation,
      'MonthlyIncome':MonthlyIncome

  }


  if st.session_state.predictor:
    result = st.session_state.predictor.Predict(input_data)

    if result is not None:
      st.subheader(f"Prediction Result is {result}")
      st.write(f"Likely to purchase" if result ==1 else "Unlikely to purchase")
    else:
      st.write(result)
      st.error("Error in prediction")
  else:
    st.error("Models are not loaded, please ensure the model and threshold are available on Hugging face")



In [None]:
class Hosting:
  def HostingHFSpace(self):
    print(f"Function Name {inspect.currentframe().f_code.co_name}")
    api = HfApi()
    repo_id = 'jpkarthikeyan/Tourism-Prediction-Model-Space'
    directory_to_upload = '/content/drive/MyDrive/PGP_AI_ML_GREAT_LEARNING/10_Advance_Machine_Learning_And_MLOps/Final_Project/VisitWithUs-Tourism/Master/'

    try:
      api.repo_info(repo_id = repo_id, repo_type='space')
      print(f"Space {'repo_id'} already existis")
    except RepositoryNotFoundError:
      create_repo(repo_id= repo_id, repo_type='space',
                       space_sdk= 'docker', private = False)
      print(f"Space created {repo_id}")

    api.upload_folder(repo_id = repo_id, folder_path = f'{directory_to_upload}/Deployment/',
                      repo_type='space')


if __name__ == '__main__':
  ObjSpace = Hosting()
  ObjSpace.HostingHFSpace()