# Use a simple keyword approach to categorize complaints

In [1]:
# First, let's read the uploaded CSV file to see what the data looks like
import pandas as pd

# Load the complaints dataset from the CSV file into a DataFrame
file_path = 'complaints_test.csv'
complaints_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check the columns and sample values
complaints_df.head(n=5)


Unnamed: 0.1,Unnamed: 0,Complaint
0,0,"I was charged a monthly maintenance fee, and I..."
1,1,The ATM near my house is always out of service.
2,2,Your mobile app keeps crashing whenever I try ...
3,3,"I applied for a credit card, and it's been thr..."
4,4,I can't seem to understand the fee structure o...


In [2]:
# Keyword table: maps each complaint category to the keywords that signal it.
# The categorization code in this notebook lowercases both the keyword and the
# complaint and tests for substring containment, so 'charge' also matches
# 'charged'. Some keywords ('charge', 'service') intentionally appear under
# more than one category.
category_keywords = {
    'General Banking Services': [
        'bank',
        'branch',
        'maintenance',
        'service',
        'account',
    ],
    'Credit Cards': [
        'credit card',
        'interest rate',
        'credit limit',
        'charge',
    ],
    'Loans and Mortgages': [
        'loan',
        'mortgage',
        'interest',
        'repayment',
        'installment',
    ],
    'Investment Services': [
        'investment',
        'portfolio',
        'stock',
        'mutual fund',
        'capital',
    ],
    'Online and Mobile Banking': [
        'online',
        'mobile app',
        'website',
        'login',
        'password',
    ],
    'Business Banking': [
        'business',
        'corporate',
        'enterprise',
        'commercial',
    ],
    'Customer Service': [
        'customer service',
        'call center',
        'support',
        'help',
    ],
    'International Banking': [
        'international',
        'foreign',
        'exchange rate',
        'currency',
    ],
    'Fraud and Security': [
        'fraud',
        'scam',
        'security',
        'unauthorized',
        'hack',
    ],
    'Savings and Deposit Accounts': [
        'savings',
        'deposit',
        'withdrawal',
        'balance',
    ],
    'Retirement Accounts': [
        'retirement',
        'pension',
        '401k',
        'IRA',
    ],
    'Insurance Services': [
        'insurance',
        'coverage',
        'claim',
        'policy',
    ],
    'Additional Services': [
        'additional',
        'extra',
        'fee',
        'charge',
        'service',
    ],
}

def categorize_complaint(complaint_text, keywords_by_category=None):
    """
    Categorizes a complaint based on keywords.

    A category matches when at least one of its keywords occurs in the
    complaint as a case-insensitive substring (so 'charge' also matches
    'charged').

    Args:
        complaint_text: The complaint to classify. Non-string values (for
            example None, or the NaN pandas yields for a missing cell) are
            treated as containing no text instead of raising.
        keywords_by_category: Optional mapping of category name to an
            iterable of keyword strings. When omitted, the module-level
            ``category_keywords`` table is used.

    Returns:
        The names of all matching categories joined with ', ' in the
        mapping's iteration order, or 'Uncategorized' if nothing matches.
    """
    # Missing values cannot be lowercased; report them as unmatched.
    if not isinstance(complaint_text, str):
        return 'Uncategorized'
    if keywords_by_category is None:
        keywords_by_category = category_keywords

    # Lowercase once instead of once per keyword.
    text = complaint_text.lower()
    categories = [
        category
        for category, keywords in keywords_by_category.items()
        # any() stops at the first hit, so each category is listed once.
        if any(keyword.lower() in text for keyword in keywords)
    ]
    return ', '.join(categories) if categories else 'Uncategorized'

# Apply the categorization function to the 'Complaint' column
complaints_df['Category'] = complaints_df['Complaint'].apply(categorize_complaint)

# Drop the 'Unnamed: 0' column as it seems to be an index.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
complaints_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Show the first few rows of the categorized data
complaints_df.head()


Unnamed: 0,Complaint,Category
0,"I was charged a monthly maintenance fee, and I...","General Banking Services, Credit Cards, Additi..."
1,The ATM near my house is always out of service.,"General Banking Services, Additional Services"
2,Your mobile app keeps crashing whenever I try ...,Online and Mobile Banking
3,"I applied for a credit card, and it's been thr...",Credit Cards
4,I can't seem to understand the fee structure o...,"Credit Cards, Additional Services"


In [3]:
import numpy as np

def categorize_complaint_with_probability(complaint_text, keywords_by_category=None):
    """
    Categorizes a complaint based on keywords and gives a probability score.

    Each category scores one point per keyword that occurs in the complaint
    as a case-insensitive substring. The best category is the one with the
    highest score; on a tie, the category that comes first in the mapping
    wins. The "probability" is the best category's share of all keyword hits
    across every category; it is a heuristic ratio, not a calibrated
    probability.

    Args:
        complaint_text: The complaint to classify. Non-string values (for
            example None, or the NaN pandas yields for a missing cell) are
            treated as containing no text instead of raising.
        keywords_by_category: Optional mapping of category name to an
            iterable of keyword strings. When omitted, the module-level
            ``category_keywords`` table is used.

    Returns:
        A ``(best_category, probability)`` tuple. The probability is rounded
        to two decimals. If no keyword matches, ``('Uncategorized', 1.0)``
        is returned.
    """
    # Missing values cannot be lowercased; report them as unmatched.
    if not isinstance(complaint_text, str):
        return 'Uncategorized', 1.0
    if keywords_by_category is None:
        keywords_by_category = category_keywords

    # Lowercase once instead of once per keyword.
    text = complaint_text.lower()
    category_scores = {
        category: sum(keyword.lower() in text for keyword in keywords)
        for category, keywords in keywords_by_category.items()
    }
    total_score = sum(category_scores.values())

    if total_score == 0:
        # If no keywords are matched, the category is 'Uncategorized' with probability 1.0
        return 'Uncategorized', 1.0

    # max() returns the first maximal key, which gives the tie-break above.
    best_category = max(category_scores, key=category_scores.get)
    probability = category_scores[best_category] / total_score

    return best_category, np.round(probability, 2)

# Apply the categorization function to the 'Complaint' column.
# Each call returns a (category, probability) tuple; the tuples are expanded
# into two columns with a single DataFrame construction rather than building
# one pd.Series per row, which is much slower on large frames.
complaints_df[['Best Category', 'Probability']] = pd.DataFrame(
    complaints_df['Complaint'].apply(categorize_complaint_with_probability).tolist(),
    columns=['Best Category', 'Probability'],
    index=complaints_df.index,  # keep rows aligned with the original frame
)

# Show the first few rows of the categorized data with probabilities
complaints_df.head()


Unnamed: 0,Complaint,Category,Best Category,Probability
0,"I was charged a monthly maintenance fee, and I...","General Banking Services, Credit Cards, Additi...",Additional Services,0.5
1,The ATM near my house is always out of service.,"General Banking Services, Additional Services",General Banking Services,0.5
2,Your mobile app keeps crashing whenever I try ...,Online and Mobile Banking,Online and Mobile Banking,1.0
3,"I applied for a credit card, and it's been thr...",Credit Cards,Credit Cards,1.0
4,I can't seem to understand the fee structure o...,"Credit Cards, Additional Services",Credit Cards,0.5
