In [None]:
# Preparing the environment and loading the dataset
# Install dependencies
%pip install -r ../requirements.txt

# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Defines the path to the data directory
data_path = "c:/Users/owner/OneDrive/Varsity/2025/PGDDA/S2/PDAN/Projects/POE-Part-1/data"

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

# Construct the full file path using os.path.join (recommended for cross-platform compatibility by Python documentation)
file_path = os.path.join(data_path, "train.csv")

# Load dataset
try:
    data = pd.read_csv(file_path)
    # Display the first few rows
    print("Dataset loaded successfully!")
    print(data.head())
    # A nifty recommendation if the file is not found
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    print("Please check the following:")
    print("1. Ensure the 'kaggle' command ran successfully and downloaded the files.")
    print("2. Check the data directory for the actual filenames. It should contain 'lda_features.csv'.")

Note: you may need to restart the kernel to use updated packages.
Dataset loaded successfully!
        id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL


In [2]:
# Prepare the data
# Display basic information about the dataset
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# If there are missing values, print a message and fill them
if missing_values.any():
    print("There are missing values in the dataset, using forward fill to handle them.")
    # Preprocess data (fill missing values, encode categorical variables)
    # Fill missing values with forward fill method
    data.fillna(method='ffill', inplace=True)
    print("Missing values after forward fill:")
    print(data.isnull().sum())
# One-hot encode the 'author' column before other encoding
print("Unique authors:", data['author'].unique())
author_dummies = pd.get_dummies(data['author'], prefix='author')
print("One-hot encoded author columns:", author_dummies.columns)
data = pd.concat([data.drop(columns=['author']), author_dummies], axis=1)
print(data.head())

Missing values in each column:
id        0
text      0
author    0
dtype: int64
Unique authors: ['EAP' 'HPL' 'MWS']
One-hot encoded author columns: Index(['author_EAP', 'author_HPL', 'author_MWS'], dtype='object')
        id                                               text  author_EAP  \
0  id26305  This process, however, afforded me no means of...        True   
1  id17569  It never once occurred to me that the fumbling...       False   
2  id11008  In his left hand was a gold snuff box, from wh...        True   
3  id27763  How lovely is spring As we looked from Windsor...       False   
4  id12958  Finding nothing else, not even gold, the Super...       False   

   author_HPL  author_MWS  
0       False       False  
1        True       False  
2       False       False  
3       False        True  
4        True       False  


In [None]:
#More imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Prepare input and target
X = data['text'].values
y = data[['author_EAP', 'author_HPL', 'author_MWS']].values

# Tokenize and pad text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=200)

# Build the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=200),
    LSTM(64),
    Dense(3, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_pad, y, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5




[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 33ms/step - accuracy: 0.6879 - loss: 0.6980 - val_accuracy: 0.8123 - val_loss: 0.4728
Epoch 2/5
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.8837 - loss: 0.3086 - val_accuracy: 0.8189 - val_loss: 0.4964
Epoch 3/5
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.9297 - loss: 0.1915 - val_accuracy: 0.8128 - val_loss: 0.5659
Epoch 4/5
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 44ms/step - accuracy: 0.9518 - loss: 0.1385 - val_accuracy: 0.8054 - val_loss: 0.8051
Epoch 5/5
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 56ms/step - accuracy: 0.9669 - loss: 0.0966 - val_accuracy: 0.8123 - val_loss: 0.7402


<keras.src.callbacks.history.History at 0x20cec84c830>

In [4]:
# Load test data
test_path = "../data/test.csv"
test_data = pd.read_csv(test_path)

print(test_data.head())

        id                                               text
0  id02310  Still, as I urged our leaving Ireland with suc...
1  id24541  If a fire wanted fanning, it could readily be ...
2  id00134  And when they had broken down the frail door t...
3  id27757  While I was thinking how I should possibly man...
4  id04081  I am not sure to what limit his knowledge may ...


In [5]:
# Randomly select 10 indices
random_indices = np.random.choice(test_data.index, size=10, replace=False)

for idx in random_indices:
    sample_text = [test_data.loc[idx, 'text']]
    sample_seq = tokenizer.texts_to_sequences(sample_text)
    sample_pad = pad_sequences(sample_seq, maxlen=200)
    pred = model.predict(sample_pad)
    print(f"Text: {sample_text[0][:60]}...")
    print(f"Predicted probabilities (EAP, HPL, MWS): {pred[0]}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
Text: I resolutely and perseveringly kept my attention riveted upo...
Predicted probabilities (EAP, HPL, MWS): [0.9968522  0.00210096 0.00104676]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Text: Fortunately the village was small and the death rate low, so...
Predicted probabilities (EAP, HPL, MWS): [0.0015895  0.21289864 0.78551185]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Text: They searched, but found nothing....
Predicted probabilities (EAP, HPL, MWS): [0.8950657  0.10079294 0.00414137]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Text: The Portuguese sailors coming in from a voyage cross themsel...
Predicted probabilities (EAP, HPL, MWS): [0.73210263 0.2663502  0.00154712]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Text: The banks of the Thames presented a new scene; they were fla...
Predicte

In [6]:
# Example prediction using the random indices to predict authors by name
author_labels = ['EAP', 'HPL', 'MWS']

for idx in random_indices:
    sample_text = [test_data.loc[idx, 'text']]
    sample_seq = tokenizer.texts_to_sequences(sample_text)
    sample_pad = pad_sequences(sample_seq, maxlen=200)
    pred = model.predict(sample_pad)
    pred_idx = np.argmax(pred[0])
    pred_author = author_labels[pred_idx]
    print(f"Text: {sample_text[0][:60]}...")
    print(f"Predicted author: {pred_author}")
    print(f"Probabilities (EAP, HPL, MWS): {pred[0]}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Text: I resolutely and perseveringly kept my attention riveted upo...
Predicted author: EAP
Probabilities (EAP, HPL, MWS): [0.9968522  0.00210096 0.00104676]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Text: Fortunately the village was small and the death rate low, so...
Predicted author: MWS
Probabilities (EAP, HPL, MWS): [0.0015895  0.21289864 0.78551185]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Text: They searched, but found nothing....
Predicted author: EAP
Probabilities (EAP, HPL, MWS): [0.8950657  0.10079294 0.00414137]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Text: The Portuguese sailors coming in from a voyage cross themsel...
Predicted author: EAP
Probabilities (EAP, HPL, MWS): [0.73210263 0.2663502  0.00154712]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Text: The banks of the Thames p

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from tqdm import tqdm

# Configuration
# Using a smaller number for a quicker run.
NUM_KEYWORDS_PER_AUTHOR = 100
# Authors in the dataset
AUTHORS = ['EAP', 'HPL', 'MWS']

# Helper Functions
# Extract top keywords per author
def extract_top_keywords(train_df, authors, num_keywords):
    """Extracts top TF-IDF keywords for each author."""
    print("✅ Loaded training data with {} rows.".format(len(train_df)))
    
    # Aggregate all text for each author
    author_texts = train_df.groupby('author')['text'].apply(' '.join)
    print(f"✅ Aggregated text for {len(authors)} authors.")
    
    author_keywords = {}
    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english', max_features=num_keywords)
    
    for author in authors:
        # Fit and transform the text for the current author
        vectorizer.fit([author_texts[author]])
        # Get the feature names (keywords)
        keywords = vectorizer.get_feature_names_out()
        author_keywords[author] = keywords
        print(f"📘 {author}: {', '.join(keywords[:5])}...")
        
    return author_keywords
# Build fuzzy logic system
# CoPilot suggestion implementation and improvements using intellisense
def build_fuzzy_system(author_keywords, authors):
    """
    Builds the fuzzy logic control system and rules efficiently.
    This version correctly pre-builds all antecedents before creating rules.
    """
    # Setup Universes and Consequents (Outputs)
    keyword_count_universe = np.arange(0, 11, 1)
    score_universe = np.arange(0, 11, 1)
    
    output_objects = {}
    for author in authors:
        consequent = ctrl.Consequent(score_universe, f'score_{author}')
        consequent['low'] = fuzz.trimf(consequent.universe, [0, 0, 5])
        consequent['medium'] = fuzz.trimf(consequent.universe, [0, 5, 10])
        consequent['high'] = fuzz.trimf(consequent.universe, [5, 10, 10])
        output_objects[author] = consequent

    # Pre-build ALL Antecedents (Inputs)
    # Another change suggested by CoPilot, building all antecedents first and then referencing them
    # This is the critical change: create all input objects first.
    print("    - Defining all input variables (antecedents)...")
    antecedent_objects = {}
    all_author_keyword_pairs = [
        (author, keyword) 
        for author in authors 
        for keyword in author_keywords[author]
    ]
    for author, keyword in all_author_keyword_pairs:
        name = f'{author}_{keyword}_count'
        antecedent = ctrl.Antecedent(keyword_count_universe, name)
        antecedent['low'] = fuzz.trimf(antecedent.universe, [0, 0, 1])
        antecedent['medium'] = fuzz.trimf(antecedent.universe, [0, 2, 4])
        antecedent['high'] = fuzz.smf(antecedent.universe, 3, 5)
        antecedent_objects[name] = antecedent

    # Create Rules by Referencing Existing Objects (with Progress Bar)
    rules = []
    for author, keyword in tqdm(all_author_keyword_pairs, desc="    - Generating rules"):
        # Get the pre-built antecedent and consequent
        antecedent = antecedent_objects[f'{author}_{keyword}_count']
        author_output = output_objects[author]
        
        # Create rules linking them
        rules.append(ctrl.Rule(antecedent['medium'], author_output['medium']))
        rules.append(ctrl.Rule(antecedent['high'], author_output['high']))

    # Finalize the System
    print(f"    - Constructing the control system graph from {len(rules)} rules... (this may take a few minutes)")
    fuzzy_system = ctrl.ControlSystem(rules)
    fuzzy_system_sim = ctrl.ControlSystemSimulation(fuzzy_system)
    
    return fuzzy_system_sim


# --- Main Execution ---
if __name__ == '__main__':
    print("🚀 Starting fuzzy author classification...\n")
    
    # Load the training data
    train_data = pd.read_csv("../data/train.csv")
    train_df = pd.DataFrame(train_data)
    # Load the test data
    test_data = pd.read_csv("../data/test.csv")
    test_df = pd.DataFrame(test_data)

    # Extract top keywords per author
    print("🔍 Step 1: Extracting top keywords per author...")
    start_time = time.time()
    author_keywords = extract_top_keywords(train_df, AUTHORS, NUM_KEYWORDS_PER_AUTHOR)
    print(f"✅ Keyword extraction complete in {time.time() - start_time:.2f} seconds.\n")

    # Build the fuzzy logic system
    print("🧠 Step 2: Building fuzzy logic system...")
    start_time = time.time()
    fuzzy_system_sim = build_fuzzy_system(author_keywords, AUTHORS)
    print(f"✅ Fuzzy system built in {time.time() - start_time:.2f} seconds.\n")

    # Classify test data and evaluate
    print("🧪 Step 3: Classifying test data and evaluating...")
    print(f"✅ Loaded test data with {len(test_df)} rows.")

    # This segment of code has been heavily optimized by CoPilot for speed. I left the original comments for context.
    # ===================================================================
    # START: OPTIMIZED CLASSIFICATION BLOCK
    # ===================================================================
    
    print("\n📊 Pre-calculating keyword counts for the entire test set...")
    # Create a new DataFrame to hold keyword counts for all texts
    keyword_counts_df = pd.DataFrame(index=test_df.index)

    # Use pandas' vectorized string methods for a massive speedup
    all_unique_keywords = set(kw for kws in author_keywords.values() for kw in kws)
    for keyword in tqdm(all_unique_keywords, desc="🔍 Counting keywords"):
        # Count this keyword across all texts at once
        counts = test_df['text'].str.lower().str.count(r'\b' + keyword + r'\b') # Use word boundaries
        # Assign counts to the correct input columns
        for author in AUTHORS:
            if keyword in author_keywords[author]:
                input_name = f'{author}_{keyword}_count'
                keyword_counts_df[input_name] = counts
    
    print("✅ Keyword counts calculated.\n")

    # Classify using the pre-calculated counts
    results = []
    # Build output_names directly from AUTHORS
    output_names = [f'score_{author}' for author in AUTHORS]  # e.g., ['score_EAP', 'score_HPL', 'score_MWS']

    # Convert DataFrame to a list of dicts for faster iteration
    records = keyword_counts_df.to_dict('records')

    for i, record in enumerate(tqdm(records, desc="🔄 Classifying texts")):
        # Pass the pre-calculated counts for one text
        fuzzy_system_sim.inputs(record)
        
        # Compute the fuzzy logic output
        try:
            fuzzy_system_sim.compute()
            # Only include outputs that exist in fuzzy_system_sim.output
            scores = []
            for name in output_names:
                if name in fuzzy_system_sim.output:
                    scores.append(fuzzy_system_sim.output[name])
                else:
                    # If missing, assign a neutral score
                    scores.append(1.67)
        except ValueError:
            # This can happen if no rules are activated. Default to a neutral score.
            scores = [1.67] * len(AUTHORS)

        # --- FIX for tied scores ---
        max_score = np.max(scores)
        
        # Check if all scores are identical and low, indicating no keywords were found
        if all(abs(score - scores[0]) < 0.1 for score in scores) and max_score < 2.0:
            predicted_author = 'UNCLEAR' # Assign a specific label for low-confidence ties
        else:
            # Otherwise, pick the author with the highest score
            predicted_author = AUTHORS[np.argmax(scores)]

        results.append({
            'id': test_df.iloc[i]['id'],
            'predicted': predicted_author,
            'scores': {AUTHORS[j]: round(s, 2) for j, s in enumerate(scores)}
        })

    # ===================================================================
    # END: OPTIMIZED CLASSIFICATION BLOCK
    # ===================================================================

    print("\n✅ Classification complete.\n")
    print("📊 Sample Results:")
    
    # Display the results
    for res in results:
        scores_str = ", ".join([f"{author}={score}" for author, score in res['scores'].items()])
        print(f"ID: {res['id']}, Predicted: {res['predicted']}, Scores: {scores_str}")



🚀 Starting fuzzy author classification...

🔍 Step 1: Extracting top keywords per author...
✅ Loaded training data with 19579 rows.
✅ Aggregated text for 3 authors.
📘 EAP: air, altogether, appearance, appeared, attention...
📘 HPL: ancient, away, began, black, body...
📘 MWS: adrian, affection, air, appeared, away...
✅ Keyword extraction complete in 0.21 seconds.

🧠 Step 2: Building fuzzy logic system...
    - Defining all input variables (antecedents)...


    - Generating rules: 100%|██████████| 300/300 [00:00<00:00, 138243.38it/s]

    - Constructing the control system graph from 600 rules... (this may take a few minutes)





✅ Fuzzy system built in 417.36 seconds.

🧪 Step 3: Classifying test data and evaluating...
✅ Loaded test data with 8392 rows.

📊 Pre-calculating keyword counts for the entire test set...


  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts_df[input_name] = counts
  keyword_counts

✅ Keyword counts calculated.



🔄 Classifying texts: 100%|██████████| 8392/8392 [4:54:48<00:00,  2.11s/it]  


✅ Classification complete.

📊 Sample Results:
ID: id02310, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id24541, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id00134, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id27757, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=1.67
ID: id04081, Predicted: EAP, Scores: EAP=5.0, HPL=1.67, MWS=1.67
ID: id27337, Predicted: EAP, Scores: EAP=5.0, HPL=1.67, MWS=1.67
ID: id24265, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=1.67
ID: id25917, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id04951, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id14549, Predicted: EAP, Scores: EAP=5.0, HPL=1.67, MWS=5.0
ID: id22505, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id24002, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=1.67
ID: id18982, Predicted: HPL, Scores: EAP=1.67, HPL=5.0, MWS=5.0
ID: id15181, Predicted: EAP, Scores: EAP=5.0, HPL=5.0, MWS=5.0
ID: id21888, Predicted: UNCLEAR, Scores: EAP=1.67, HPL=1.67, M


