In [133]:
import re
import spacy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from autocorrect import Speller

spell = Speller()

# Load the English model

nlp = spacy.load("en_core_web_md")
data = pd.read_csv("work-data/train.csv")
data.dropna(inplace=True)
data.head()

data["tokens"] = None

In [134]:
def preprocess_text(text):
    # text = spell(text)
    # replace it 's with is
    text = text.replace(r"'s", "is")
    # replace 've with have
    text = text.replace("'ve", "have")
    # replace n't with not
    text = text.replace("n't", "not")
    # replace 're with are
    text = text.replace("'re", "are")
    # replace 'd with would
    text = text.replace("'d", "would")
    # replace 'll with will
    text = text.replace("'ll", "will")
    # replace 'm with am
    text = text.replace("'m", "am")
    text = text.replace("-", " ")

    text = re.sub(r'[^a-zA-Z\s]', '', text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":  # Detect person names
            text = text.replace(ent.text, "person")
    doc = nlp(text)
    # remove extra spaces
    doc = [token for token in doc if
           not token.is_space and token.is_ascii and token.is_alpha and not token.is_currency and not token.is_digit and not token.is_punct]
    return ' '.join([token.text for token in doc]).lower()
    # return ' '.join([token.lemma_ for token in doc
    #                  if token.lemma_ != '-PRON-']).lower()


In [135]:
preprocess_text("I 'm running to the garden James Madison")

'i am running to the garden person'

In [136]:
def get_sentiment_features(text):
    try:
        blob = TextBlob(text)
        return {
            "polarity": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity
        }
    except:
        return {"polarity": 0, "subjectivity": 0}


In [137]:
get_sentiment_features("textblob is amazingly simple to use. What great fun")

{'polarity': 0.3666666666666667, 'subjectivity': 0.4357142857142857}

In [138]:
def get_text_stats(text):
    """Calculate statistical features from text"""
    words = text.split()
    sentiment = get_sentiment_features(text)
    return {
        'text_length': len(text),
        'word_count': len(words),
        'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
        'sentiment_polarity': sentiment['polarity'],
        'sentiment_subjectivity': sentiment['subjectivity'],
        'unique_words': len(set(words)),
        'stopword_count': len([word for word in words if word in nlp.Defaults.stop_words]),
    }

In [139]:
get_text_stats("Textblob is amazingly simple to use. What great fun!")

{'text_length': 52,
 'word_count': 9,
 'avg_word_length': 4.888888888888889,
 'sentiment_polarity': 0.39166666666666666,
 'sentiment_subjectivity': 0.4357142857142857,
 'unique_words': 9,
 'stopword_count': 2}

In [140]:
def get_text_stats_value(X):
    features_list = []
    for text in X:
        stats = get_text_stats(text)
        features = list(stats.values())
        features_list.append(features)
    return np.array(features_list)

In [141]:
get_text_stats_value(["Textblob is amazingly simple to use. What great fun!"])

array([[52.        ,  9.        ,  4.88888889,  0.39166667,  0.43571429,
         9.        ,  2.        ]])

In [142]:
df = pd.read_csv("work-data/train.csv")
print("Starting text preprocessing...")
df['processed_text'] = df['text'].apply(preprocess_text)
print("Text preprocessing completed.")

Starting text preprocessing...
Text preprocessing completed.


In [143]:
text_features = get_text_stats_value(df['processed_text'])

text_features

array([[168.        ,  25.        ,   5.76      , ...,   0.60833333,
         25.        ,  11.        ],
       [151.        ,  21.        ,   6.23809524, ...,   0.39444444,
         20.        ,   6.        ],
       [176.        ,  33.        ,   4.36363636, ...,   0.74285714,
         28.        ,  21.        ],
       ...,
       [ 50.        ,  13.        ,   2.92307692, ...,   0.        ,
         13.        ,   8.        ],
       [205.        ,  32.        ,   5.4375    , ...,   0.        ,
         29.        ,  13.        ],
       [ 48.        ,  10.        ,   3.9       , ...,   0.        ,
         10.        ,   4.        ]])

In [188]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.api.layers import Dense, Input, Dropout, BatchNormalization
from sklearn.metrics import accuracy_score
from keras.api.callbacks import EarlyStopping
from keras.api.optimizers import Adam
from keras.api.regularizers import l2


def train_model(data):
    """Train the model with all features"""
    # Split the data
    X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
        data['processed_text'],
        text_features,
        data['label'],
        test_size=0.2,
        random_state=42
    )

    # Initialize the TF-IDF vectorizer
    tfidf = TfidfVectorizer(
        max_features=15000,
        min_df=2,
        ngram_range=(1, 3),
        stop_words='english'
    )

    # Transform the text data to feature vectors
    X1_train_tfidf = tfidf.fit_transform(X1_train).toarray()  # Convert to dense matrix
    X1_test_tfidf = tfidf.transform(X1_test).toarray()  # Use the same TF-IDF fit on test set

    scaler = StandardScaler()

    X1_train_tfidf_scaled = scaler.fit_transform(X1_train_tfidf)
    X1_test_tfidf_scaled = scaler.transform(X1_test_tfidf)

    scaler_features = StandardScaler()
    feature_weight = 1  # Adjust this value to give more weight to additional features

    X2_train_scaled = scaler_features.fit_transform(X2_train) * feature_weight
    X2_test_scaled = scaler_features.transform(X2_test) * feature_weight

    X_train_combined = np.hstack((X1_train_tfidf_scaled, X2_train_scaled))
    X_test_combined = np.hstack((X1_test_tfidf_scaled, X2_test_scaled))


    # Normalize labels to be between 0 and 4
    y_train_scaled = y_train - 1
    y_test_scaled = y_test - 1

    # Build the MLP model

    model = Sequential()
    model.add(Input(shape=(X_train_combined.shape[1],)))
    model.add(Dense(512, activation='relu'))  # Hidden layer
    model.add(Dropout(0.1))  # Dropout
    model.add(Dense(256, activation='relu'))  # Hidden layer
    model.add(Dropout(0.1))  # Dropout
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.1))  # Dropout
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.1))  # Dropout
    model.add(Dense(5, activation='softmax'))

    optimizer = Adam(learning_rate=0.002)

    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Fit the model
    print("Training model...")
    model.fit(X_train_combined, y_train_scaled, epochs=100, batch_size=32, callbacks=[early_stopping],
              validation_split=0.2)
    print("Model training completed.")

    # Evaluate the model
    accuracy = model.evaluate(X_test_combined, y_test_scaled)
    print(f"Accuracy: {accuracy[1] * 100:.2f}%")

    # Make predictions
    y_pred = model.predict(X_test_combined)
    y_pred_classes = y_pred.argmax(axis=-1) + 1  # Convert to original label scale (1-5)

    # Calculate accuracy or other metrics if needed
    accuracy = accuracy_score(y_test, y_pred_classes)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

    return model, accuracy, tfidf


In [189]:
model, metrics, tfidf = train_model(df)

print(metrics)

Training model...
Epoch 1/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.2578 - loss: 1.6397 - val_accuracy: 0.3261 - val_loss: 1.5269
Epoch 2/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4841 - loss: 1.2665 - val_accuracy: 0.3358 - val_loss: 1.5353
Epoch 3/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8034 - loss: 0.5672 - val_accuracy: 0.3238 - val_loss: 1.9193
Epoch 4/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9351 - loss: 0.2175 - val_accuracy: 0.3396 - val_loss: 2.6961
Model training completed.
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3396 - loss: 1.5199
Accuracy: 32.65%
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test Accuracy: 32.65%
0.3265060240963855


In [146]:
validation = pd.read_csv("work-data/val.csv")

validation['processed_text'] = validation['text'].apply(preprocess_text)
validation


Unnamed: 0,id,label,text,processed_text
0,6447,5,"At least it 's a fairly impressive debut from the director , Charles Stone III .",at least it is a fairly impressive debut from the director person
1,8503,2,Bland but harmless .,person but harmless
2,2594,5,"Muccino , who directed from his own screenplay , is a canny crowd pleaser , and The Last Kiss ... provides more than enough sentimental catharsis for a satisfying evening at the multiplex .",muccino who directed from his own screenplay is a canny crowd pleaser and the last kiss provides more than enough sentimental catharsis for a satisfying evening at the multiplex
3,6482,2,This mistaken-identity picture is so film-culture referential that the final product is a ghost .,this mistaken identity picture is so film culture referential that the final product is a ghost
4,5685,2,"Laconic and very stilted in its dialogue , this indie flick never found its audience , probably because it 's extremely hard to relate to any of the characters .",person and very stilted in its dialogue this indie flick never found its audience probably because it is extremely hard to relate to any of the characters
...,...,...,...,...
1773,1055,4,"Bravo for history rewritten , and for the uncompromising knowledge that the highest power of all is the power of love .",bravo for history rewritten and for the uncompromising knowledge that the highest power of all is the power of love
1774,4880,2,The movie keeps coming back to the achingly unfunny Phonce and his several silly subplots .,the movie keeps coming back to the achingly unfunny phonce and his several silly subplots
1775,6134,2,Director David Fincher and writer David Koepp ca n't sustain it .,director person and writer person ca not sustain it
1776,615,4,"A rich tale of our times , very well told with an appropriate minimum of means .",a rich tale of our times very well told with an appropriate minimum of means


In [181]:
def predict_val():
    # Transform the text data to feature vectors
    data_tfidf = tfidf.transform(validation['processed_text']).toarray()  # Convert to dense matrix

    scaler = StandardScaler()

    x1_val_scaled = scaler.fit_transform(data_tfidf)

    scaler_features = StandardScaler()
    feature_weight = 1.0  # Adjust this value to give more weight to additional features

    text_features = get_text_stats_value(validation['processed_text'])

    x2_val_scaled = scaler_features.fit_transform(text_features) * feature_weight
    X_val_combined = np.hstack((x1_val_scaled, x2_val_scaled))

    print(X_val_combined.shape)

    return model.predict(X_val_combined)

y_predict = predict_val()

(1778, 8739)
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [185]:
# how many labels are same as predicted
validation['predicted_label'] = y_predict.argmax(axis=-1) + 1
correct = validation[validation['label'] == validation['predicted_label']]

len(correct) / len(validation)

0.34195725534308213

In [149]:
def predict_nolabel():
    """Predict no label and output as id and label in a csv"""
    predict_data = pd.read_csv("work-data/test_nolabel.csv")
    predict_data['processed_text'] = predict_data['text'].apply(preprocess_text)

    data_tfidf = tfidf.transform(predict_data['processed_text']).toarray()  # Convert to dense matrix

    scaler = StandardScaler()

    x1_val_scaled = scaler.fit_transform(data_tfidf)

    scaler_features = StandardScaler()
    feature_weight = 1.0  # Adjust this value to give more weight to additional features

    text_features = get_text_stats_value(predict_data['processed_text'])

    x2_val_scaled = scaler_features.fit_transform(text_features) * feature_weight
    X_val_combined = np.hstack((x1_val_scaled, x2_val_scaled))

    print(X_val_combined.shape)

    y_predict = model.predict(X_val_combined)

    predict_data['label'] = y_predict.argmax(axis=-1) + 1

    predict_data[['id', 'label']].to_csv("work-data/test_label.csv", index=False)

predict_nolabel()

(1779, 8739)
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
