# CLEANING AND JOINING FILES FOR TRAINING

In [11]:
# Imports and raw data loading
 
    
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd

# Raw call records; the transcript text is read from the 'call_transcript' column below.
df  =  pd.read_csv("new_calls.csv")

# CLEANING STEP 1

def filter_transcript(transcript):
    """Keep only the speaker turns of a call transcript.

    Parameters
    ----------
    transcript : str or missing value
        Raw transcript text, one utterance per line.

    Returns
    -------
    str
        The lines that start exactly with "Customer:" or "Agent:", in their
        original order, joined with a newline. Any non-string input (None,
        or the float NaN pandas uses for an empty CSV cell) yields an empty
        string instead of raising AttributeError.
    """
    # A missing cell is not a str and has no .splitlines(); treat it as an
    # empty transcript so one bad row cannot abort the whole .apply().
    if not isinstance(transcript, str):
        return ""
    speaker_prefixes = ("Customer:", "Agent:")
    # str.startswith accepts a tuple, so one call covers both speakers.
    return "\n".join(
        line for line in transcript.splitlines() if line.startswith(speaker_prefixes)
    )

# Replace every transcript with its speaker-only version (element-wise over the column)
df['call_transcript'] = df['call_transcript'].map(filter_transcript)




In [12]:
df2 = pd.read_csv("reasons.csv")

# Normalise the label text so spelling variants of one reason collapse into a
# single class. Whitespace is collapsed and trimmed LAST, because replacing
# '&' and '-' can itself create new runs of spaces (e.g. "a - b" -> "a   b"),
# which would otherwise survive as a separate label.
df2['primary_call_reason'] = (
    df2['primary_call_reason']
    .str.lower()                             # Convert to lowercase
    .str.replace('&', ' and ', regex=False)  # "a&b" and "a & b" both end up as "a and b"
    .str.replace('-', ' ', regex=False)      # hyphenated and spaced variants agree
    .str.replace(r'\s+', ' ', regex=True)    # collapse every whitespace run to one space
    .str.strip()                             # Remove extra leading/trailing spaces
)

df3 = pd.read_csv("testbc.csv")  # TEST DATA STORED HERE


In [13]:
# Non-null count per column of the test frame, before transcripts are merged in
df3.count()

call_id    5157
dtype: int64

In [14]:
# ADDING TRANSCRIPTS TO TEST CASES
# Drop any 'call_transcript' column left by a previous run of this cell first;
# otherwise re-running it would merge onto a frame that already has the column
# and produce call_transcript_x / call_transcript_y instead.
df3 = (
    df3
    .drop(columns='call_transcript', errors='ignore')
    .merge(df[['call_id', 'call_transcript']], on='call_id', how='left')
)
df3.head()

Unnamed: 0,call_id,call_transcript
0,7732610078,Agent: Thank you for calling United Airlines c...
1,2400299738,"Agent: Thank you for calling United Airlines, ..."
2,6533095063,Agent: Thank you for calling United Airlines c...
3,7774450920,"Agent: Thank you for calling United Airlines, ..."
4,9214147168,Agent: Thank you for calling United Airlines c...


In [15]:

# count() only counts non-null cells: a call_transcript count equal to the
# call_id count means no test call is missing its transcript after the left merge.
df3.count()


call_id            5157
call_transcript    5157
dtype: int64

In [16]:
# MERGE CALL AND REASONS: inner join on call_id keeps only calls that have a labelled reason
full_df = df.merge(df2, on='call_id', how='inner')
full_df.count()

call_id                    66653
customer_id                66653
agent_id                   66653
call_start_datetime        66653
agent_assigned_datetime    66653
call_end_datetime          66653
call_transcript            66653
primary_call_reason        66653
dtype: int64

In [17]:
# Drop any call whose id also appears in the test set (df3).
# NOT NECESSARY HERE: the inner join with the reasons file already filters
# those calls out, so the row count is unchanged by this step.
in_test_set = full_df['call_id'].isin(df3['call_id'])
train_df = full_df[~in_test_set]

In [18]:
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; a bare train_df.head(5) here would be silently discarded.
display(train_df.head(5))
train_df.count()

call_id                    66653
customer_id                66653
agent_id                   66653
call_start_datetime        66653
agent_assigned_datetime    66653
call_end_datetime          66653
call_transcript            66653
primary_call_reason        66653
dtype: int64

In [19]:
# Keep just the identifier, the input text and the target label
model_columns = ['call_id', 'call_transcript', 'primary_call_reason']
train_df = train_df.loc[:, model_columns]
train_df.head(5)

Unnamed: 0,call_id,call_transcript,primary_call_reason
0,4667960400,Agent: Thank you for calling United Airlines c...,voluntary cancel
1,1122072124,"Agent: Thank you for calling United Airlines, ...",booking
2,6834291559,Agent: Thank you for calling United Airlines c...,irrops
3,2266439882,Agent: Thank you for calling United Airlines c...,upgrade
4,1211603231,Agent: Thank you for calling United Airlines c...,seating


# TRAINING AND TESTING THE MODEL

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed every stochastic step (split, SMOTE, forest) so that
# Restart Kernel -> Run All reproduces the same model and predictions.
SEED = 42

y = train_df['primary_call_reason']

# Train-test split FIRST, on the raw text. Everything fitted below (vocabulary,
# IDF weights, scaling statistics, SMOTE's synthetic samples) is then derived
# from training rows only, so the held-out score is not inflated by leakage.
# stratify keeps every class represented in both parts of the split.
train_texts, test_texts, y_train_raw, y_test = train_test_split(
    train_df['call_transcript'],
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Initialize the vectorizer and learn the vocabulary from the training split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_texts)

# Scaling. with_mean=False lets the scaler work on the sparse TF-IDF matrix
# directly, so the full documents x vocabulary matrix is never densified.
# Skipping the centering only shifts each feature by a constant, which changes
# neither SMOTE's neighbour distances nor the tree splits.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_tfidf)
X_test = scaler.transform(vectorizer.transform(test_texts))

# Handle class imbalance on the training split only. SMOTE needs more samples
# in every class than k_neighbors, so shrink k for very small classes instead
# of failing.
smallest_class = int(y_train_raw.value_counts().min())
smote = SMOTE(k_neighbors=max(1, min(5, smallest_class - 1)), random_state=SEED)
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Model training
model = RandomForestClassifier(class_weight='balanced', random_state=SEED)
model.fit(X_train, y_train)

# Evaluate on the untouched hold-out rows (real calls only, no synthetic samples)
print(classification_report(y_test, model.predict(X_test)))

# Preprocess df3 with the already-fitted vectorizer and scaler. The transcripts
# come from a left merge, so a test call without a match would be NaN, which
# the vectorizer rejects; score such rows as empty text instead.
X_df3 = vectorizer.transform(df3['call_transcript'].fillna(''))  # Use the trained vectorizer
X_df3_scaled = scaler.transform(X_df3)                           # Apply the same scaling

# Make predictions on df3
df3_predictions = model.predict(X_df3_scaled)

# Save predictions to a CSV file
df3['predictions'] = df3_predictions  # Add predictions to the dataframe
df3[['predictions']].to_csv('Test.csv', index=False)  # Save only predictions column

