In [10]:
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from abc import ABC, abstractmethod

class Config:
    """Column-name constants used by the loading and modelling code."""

    # Free-text columns.
    INTERACTION_CONTENT = "Interaction content"
    TICKET_SUMMARY = "Ticket Summary"

    # Label columns, using the short 'yN' names.
    GROUPED = "y1"
    CLASS_COL = "y2"
    TYPE_COLS = ["y2", "y3", "y4"]

def get_input_data(data_dir="/kaggle/input/newnew/Actvity 3 Full Solution/data"):
    """Load both ticket CSVs, clean the text columns and keep only labelled rows.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``AppGallery.csv`` and ``Purchasing.csv``.
        Defaults to the Kaggle input path this notebook was written against.

    Returns
    -------
    pandas.DataFrame
        Rows of both files stacked together, with 'Type 1'..'Type 4' renamed
        to 'y1'..'y4', both text columns stored as ``str`` (missing text
        becomes the empty string), and an extra column ``'y'`` copied from
        ``Config.CLASS_COL``. Rows whose ``'y'`` is NaN or ``''`` are dropped.
    """
    label_names = {'Type 1': 'y1', 'Type 2': 'y2', 'Type 3': 'y3', 'Type 4': 'y4'}
    frames = [
        pd.read_csv(f"{data_dir}/{file_name}", skipinitialspace=True).rename(columns=label_names)
        for file_name in ("AppGallery.csv", "Purchasing.csv")
    ]
    # ignore_index: each file starts counting rows at 0, so a plain concat
    # would leave duplicate index labels in the combined frame.
    df = pd.concat(frames, ignore_index=True)

    for text_col in (Config.INTERACTION_CONTENT, Config.TICKET_SUMMARY):
        # Fill before casting: casting NaN to str yields the literal text
        # 'nan', which would then be tokenised like a real word.
        df[text_col] = df[text_col].fillna('').astype(str)

    df["y"] = df[Config.CLASS_COL]
    has_label = df["y"].notna() & (df["y"] != '')
    # copy() so callers can add columns without chained-assignment warnings.
    return df.loc[has_label].copy()

def get_tfidf_embd(df):
    """Return a dense TF-IDF matrix for the tickets in ``df``.

    Each row's ticket summary and interaction content are joined with a
    single space, then a ``TfidfVectorizer`` (``max_features=2000``,
    ``min_df=4``, ``max_df=0.90``) is fitted on, and applied to, every row.

    Note: the vectorizer is fitted on all rows passed in, so if the caller
    splits into train/test afterwards, the vocabulary and IDF weights have
    already seen the test rows.
    """
    summary_text = df[Config.TICKET_SUMMARY]
    body_text = df[Config.INTERACTION_CONTENT]
    documents = summary_text + ' ' + body_text

    vectorizer = TfidfVectorizer(max_df=0.90, min_df=4, max_features=2000)
    sparse_matrix = vectorizer.fit_transform(documents)
    return sparse_matrix.toarray()

def combine_embd(X1, X2):
    """Join two feature matrices side by side (along axis 1).

    Both inputs must have the same number of rows; the result has the
    columns of ``X1`` followed by the columns of ``X2``.
    """
    blocks = [X1, X2]
    combined = np.concatenate(blocks, axis=1)
    return combined


class BaseModel(ABC):
    """Abstract interface for models in this notebook.

    A concrete subclass must override ``train``, ``predict`` and
    ``data_transform`` before it can be instantiated.
    """

    @abstractmethod
    def train(self):
        """Fit the model; the base implementation does nothing."""
        return None

    @abstractmethod
    def predict(self):
        """Produce predictions; the base implementation does nothing."""
        return None

    @abstractmethod
    def data_transform(self):
        """Prepare data for the model; the base implementation does nothing."""
        return None

class RandomForest(BaseModel):
    """``BaseModel`` implementation backed by a 1000-tree random forest."""

    def __init__(self, model_name, embeddings, y):
        """Create the (unfitted) classifier and remember the given arguments.

        ``model_name``, ``embeddings`` and ``y`` are only stored on the
        instance; ``train`` and ``predict`` receive their data as arguments.
        """
        self.mdl = RandomForestClassifier(random_state=0, n_estimators=1000)
        self.model_name = model_name
        self.embeddings = embeddings
        self.y = y
        # Populated by predict(); print_results() reads it.
        self.predictions = None

    def train(self, X_train, y_train):
        """Fit the forest on ``X_train`` / ``y_train``."""
        fitted = self.mdl.fit(X_train, y_train)
        self.mdl = fitted

    def predict(self, X_test):
        """Predict labels for ``X_test``, cache them on the instance and return them."""
        predicted = self.mdl.predict(X_test)
        self.predictions = predicted
        return predicted

    def print_results(self, y_test):
        """Print a classification report of the cached predictions against ``y_test``."""
        report = classification_report(y_test, self.predictions)
        print(report)

    def data_transform(self):
        """No-op; present only to satisfy the ``BaseModel`` interface."""
        return None


if __name__ == '__main__':
    # Pin both global RNGs so repeated runs start from the same state.
    np.random.seed(0)
    random.seed(0)

    # Load the tickets, then build the TF-IDF features and the label vector.
    df = get_input_data()
    X = get_tfidf_embd(df)
    y = df['y'].values

    # 80/20 hold-out split with a fixed shuffle seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2
    )

    # Fit on the training part, then report on the held-out part.
    rf_model = RandomForest(model_name="RandomForest", embeddings=X_train, y=y_train)
    rf_model.train(X_train=X_train, y_train=y_train)
    predictions = rf_model.predict(X_test=X_test)
    rf_model.print_results(y_test=y_test)


               precision    recall  f1-score   support

       Others       0.80      0.57      0.67         7
Problem/Fault       0.67      0.73      0.70        11
   Suggestion       0.84      0.88      0.86        24

     accuracy                           0.79        42
    macro avg       0.77      0.72      0.74        42
 weighted avg       0.79      0.79      0.78        42



In [11]:
class ChainedModel:
    """Runs three already-fitted models in sequence, feeding predictions forward.

    ``models`` is indexed with ``0``, ``1`` and ``2``; each entry must expose
    a ``predict`` method.
    """

    def __init__(self, models):
        self.models = models

    def predict(self, X, y_true):
        """Predict the three stages in order and print each stage's accuracy.

        Stage 0 receives ``X`` unchanged. Every later stage receives ``X``
        with all earlier stage predictions appended as extra columns.
        ``y_true`` is indexed with the keys ``'Type2'``, ``'Type3'`` and
        ``'Type4'`` to obtain the reference labels for each stage.

        Returns a 3-tuple of the stage predictions, in stage order.

        NOTE(review): ``np.column_stack`` produces a single-dtype array, so
        non-numeric predictions would change the dtype of the stacked
        features -- confirm the supplied models emit numeric labels.
        """
        stages = (("Type2", "Type 2"), ("Type3", "Type 3"), ("Type4", "Type 4"))
        outputs = []
        for position, (truth_key, label) in enumerate(stages):
            # The first model sees the raw features; the rest also see every
            # prediction made so far.
            stage_input = np.column_stack((X, *outputs)) if outputs else X
            stage_pred = self.models[position].predict(stage_input)
            score = accuracy_score(y_true[truth_key], stage_pred)
            print(f"Accuracy for {label} prediction: {score}", flush=True)
            outputs.append(stage_pred)
        return tuple(outputs)


In [13]:
def get_input_data(data_dir="/kaggle/input/newnew/Actvity 3 Full Solution/data"):
    """Load both ticket CSVs and keep rows that have every target label.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``AppGallery.csv`` and ``Purchasing.csv``.
        Defaults to the Kaggle input path this notebook was written against.

    Returns
    -------
    pandas.DataFrame
        Rows of both files stacked together, with 'Type 1'..'Type 4' renamed
        to 'y1'..'y4' and both text columns stored as ``str`` (missing text
        becomes the empty string). Rows with a missing value in any of
        ``Config.TYPE_COLS`` are dropped.
    """
    label_names = {'Type 1': 'y1', 'Type 2': 'y2', 'Type 3': 'y3', 'Type 4': 'y4'}
    app_gallery = pd.read_csv(f"{data_dir}/AppGallery.csv", skipinitialspace=True)
    purchasing = pd.read_csv(f"{data_dir}/Purchasing.csv", skipinitialspace=True)

    # ignore_index: each file starts counting rows at 0, so a plain concat
    # would leave duplicate index labels in the combined frame.
    df = pd.concat(
        [app_gallery.rename(columns=label_names), purchasing.rename(columns=label_names)],
        ignore_index=True,
    )

    for text_col in (Config.INTERACTION_CONTENT, Config.TICKET_SUMMARY):
        # Fill before casting: astype(str) on NaN yields the literal text
        # 'nan', which would then be tokenised like a real word.
        df[text_col] = df[text_col].fillna('').astype(str)

    return df.dropna(subset=Config.TYPE_COLS)

def get_tfidf_embd(df):
    """Build a dense TF-IDF feature matrix from the two text columns of ``df``.

    The ticket summary and interaction content of each row are joined with
    one space and vectorised with ``TfidfVectorizer(max_features=2000,
    min_df=4, max_df=0.90)``, fitted on all rows of ``df``.

    Note: because fitting uses every row passed in, a train/test split done
    afterwards does not keep test rows out of the vocabulary/IDF statistics.
    """
    documents = df[Config.TICKET_SUMMARY].str.cat(df[Config.INTERACTION_CONTENT], sep=' ') \
        if False else df[Config.TICKET_SUMMARY] + ' ' + df[Config.INTERACTION_CONTENT]
    vectorizer = TfidfVectorizer(max_df=0.90, min_df=4, max_features=2000)
    tfidf_sparse = vectorizer.fit_transform(documents)
    return tfidf_sparse.toarray()


class ChainedMultiOutputsModel:
    """One random forest per target column, each trained on the same features.

    Despite the class name, nothing is chained here: every forest is fitted
    on the unmodified feature matrix and predicts its own column
    independently of the others.

    Parameters
    ----------
    target_cols : iterable of str, optional
        Names of the label columns to model. Defaults to ``Config.TYPE_COLS``.
    """

    def __init__(self, target_cols=None):
        # Config defines TYPE_COLS but no CLASS_COLS attribute, so reading
        # Config.CLASS_COLS raised AttributeError on a fresh kernel.
        self.target_cols = list(Config.TYPE_COLS if target_cols is None else target_cols)
        self.models = {
            col: RandomForestClassifier(n_estimators=1000, random_state=0)
            for col in self.target_cols
        }

    def train(self, X_train, y_train):
        """Fit each column's forest on ``X_train`` against ``y_train[col]``."""
        for col in self.target_cols:
            self.models[col].fit(X_train, y_train[col])

    def predict(self, X_test):
        """Return ``{column name: predicted labels}`` for ``X_test``."""
        return {col: self.models[col].predict(X_test) for col in self.target_cols}

    def print_results(self, y_test, predictions):
        """Print the accuracy of ``predictions[col]`` against ``y_test[col]`` per column."""
        for col in self.target_cols:
            accuracy = accuracy_score(y_test[col], predictions[col])
            print(f"Accuracy for {col}: {accuracy}")


if __name__ == '__main__':
    # Pin both global RNGs so repeated runs start from the same state.
    random.seed(0)
    np.random.seed(0)

    df = get_input_data()
    X = get_tfidf_embd(df)
    # Config has no CLASS_COLS attribute (that lookup raises AttributeError
    # on a fresh kernel); TYPE_COLS holds the y2/y3/y4 target columns.
    y = df[Config.TYPE_COLS]

    # 80/20 hold-out split with a fixed shuffle seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    model = ChainedMultiOutputsModel()
    model.train(X_train, y_train)

    predictions = model.predict(X_test)
    model.print_results(y_test, predictions)


Accuracy for y2: 0.7272727272727273
Accuracy for y3: 0.696969696969697
Accuracy for y4: 0.6060606060606061


In [19]:
def get_input_data(data_dir="/kaggle/input/newnew/Actvity 3 Full Solution/data"):
    """Load both ticket CSVs and one-hot encode the target label columns.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``AppGallery.csv`` and ``Purchasing.csv``.
        Defaults to the Kaggle input path this notebook was written against.

    Returns
    -------
    pandas.DataFrame
        Rows of both files stacked together, with 'Type 1'..'Type 4' renamed
        to 'y1'..'y4' and both text columns stored as ``str`` (missing text
        becomes the empty string). Rows with a missing value in any of
        ``Config.TYPE_COLS`` are dropped, and those columns are then replaced
        by their ``pd.get_dummies`` indicator columns.
    """
    label_names = {'Type 1': 'y1', 'Type 2': 'y2', 'Type 3': 'y3', 'Type 4': 'y4'}
    frames = [
        pd.read_csv(f"{data_dir}/{file_name}", skipinitialspace=True).rename(columns=label_names)
        for file_name in ("AppGallery.csv", "Purchasing.csv")
    ]
    # ignore_index: each file starts counting rows at 0, so a plain concat
    # would leave duplicate index labels in the combined frame.
    df = pd.concat(frames, ignore_index=True)

    for text_col in (Config.INTERACTION_CONTENT, Config.TICKET_SUMMARY):
        # Fill before casting: astype(str) on NaN yields the literal text
        # 'nan', which would then be tokenised like a real word.
        df[text_col] = df[text_col].fillna('').astype(str)

    # Drop rows lacking any target label before encoding the targets.
    labelled = df.dropna(subset=Config.TYPE_COLS)
    return pd.get_dummies(labelled, columns=Config.TYPE_COLS)

def get_tfidf_embd(df):
    """Turn each ticket's text into a row of a dense TF-IDF matrix.

    The text for a row is its ticket summary, one space, then its interaction
    content. The ``TfidfVectorizer`` keeps at most 2000 features and uses
    ``min_df=4`` and ``max_df=0.90``; it is fitted on all rows of ``df``.

    Note: because fitting uses every row passed in, a train/test split done
    afterwards does not keep test rows out of the vocabulary/IDF statistics.
    """
    summary_text = df[Config.TICKET_SUMMARY]
    content_text = df[Config.INTERACTION_CONTENT]
    documents = summary_text + ' ' + content_text

    vectorizer = TfidfVectorizer(max_df=0.90, min_df=4, max_features=2000)
    return vectorizer.fit_transform(documents).toarray()

# Define the HierarchicalModel class
class HierarchicalModel:
    """Stack of random forests, one per entry in ``Config.TYPE_COLS``.

    The targets for a level are all columns of the label frame whose name
    starts with that level's prefix. Each level after the first is trained
    (and later queried) on the features of the previous level with the
    previous level's predictions appended as extra columns.
    """

    def __init__(self):
        # prefix -> fitted RandomForestClassifier, filled in by train().
        self.models = {}

    @staticmethod
    def _target_columns(frame, prefix):
        """Names of the columns in ``frame`` that start with ``prefix``."""
        return [name for name in frame.columns if name.startswith(prefix)]

    @staticmethod
    def _append_columns(features, extra):
        """Return ``features`` with ``extra`` stacked on as trailing columns."""
        if extra.ndim == 1:
            # hstack needs a 2-D block, so promote a flat vector to one column.
            extra = extra[:, np.newaxis]
        return np.hstack([features, extra])

    def train(self, X_train, y_train):
        """Fit one forest per level, widening the features level by level."""
        features = X_train
        previous_prefix = None
        for col_prefix in Config.TYPE_COLS:
            classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
            targets = self._target_columns(y_train, col_prefix)
            if previous_prefix is not None:
                # The previous level predicts on the very features it was
                # fitted on; its output becomes extra input for this level.
                upstream = self.models[previous_prefix].predict(features)
                features = self._append_columns(features, upstream)
            classifier.fit(features, y_train[targets])
            self.models[col_prefix] = classifier
            previous_prefix = col_prefix

    def predict(self, X_test):
        """Return ``{prefix: predictions}``, chaining levels as in ``train``."""
        features = X_test
        predictions = {}
        previous_prefix = None
        for col_prefix in Config.TYPE_COLS:
            if previous_prefix is not None:
                features = self._append_columns(features, predictions[previous_prefix])
            predictions[col_prefix] = self.models[col_prefix].predict(features)
            previous_prefix = col_prefix
        return predictions

    def print_results(self, y_test, predictions):
        """Print ``accuracy_score`` of each level against its columns of ``y_test``."""
        for col_prefix in Config.TYPE_COLS:
            targets = self._target_columns(y_test, col_prefix)
            accuracy = accuracy_score(y_test[targets], predictions[col_prefix])
            print(f"Accuracy for {col_prefix}: {accuracy}")

# Main Execution Logic
if __name__ == '__main__':
    # Pin both global RNGs so repeated runs start from the same state.
    np.random.seed(0)
    random.seed(0)

    df = get_input_data()
    X = get_tfidf_embd(df)

    # Everything except the two text columns (and any raw TYPE_COLS columns)
    # is handed over as y; the model picks its targets out by name prefix.
    y = df.loc[:, ~df.columns.isin(
        [Config.TICKET_SUMMARY, Config.INTERACTION_CONTENT] + Config.TYPE_COLS
    )]

    # 80/20 hold-out split with a fixed shuffle seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2
    )

    # Fit the level-by-level model, predict the held-out rows, report accuracy.
    model = HierarchicalModel()
    model.train(X_train, y_train)
    predictions = model.predict(X_test)
    model.print_results(y_test, predictions)

Accuracy for y2: 0.7272727272727273
Accuracy for y3: 0.48484848484848486
Accuracy for y4: 0.42424242424242425
