In [1]:
import joblib
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from joblib import dump, load
from scipy.sparse import hstack
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample



In [2]:
def load_data(file_path):
    """Read a JSON Lines file into a DataFrame.

    Parameters
    ----------
    file_path
        Location of the input; handed unchanged to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        The parsed records, read with ``lines=True`` (one JSON object per line).
    """
    frame = pd.read_json(path_or_buf=file_path, lines=True)
    return frame

In [3]:
def add_features(df, rare_threshold=5):
    """Add length, diversity and rare-word features to ``df`` and build the rare-word set.

    The frame is modified in place (four columns are added) and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. Each entry is passed to ``len``, ``set``
        and ``Counter.update``, so it is assumed to be a sequence of hashable
        tokens (a plain string would be counted character by character).
    rare_threshold : int, default 5
        A token is "rare" when its total count across all rows of ``df`` is
        less than or equal to this value.

    Returns
    -------
    (pandas.DataFrame, set)
        The same ``df`` with ``length_of_sentence``, ``lexicon_density``,
        ``type_token_ratio`` and ``rare_word_count`` added, and the set of
        rare tokens (to be reused when featurizing other frames).
    """
    texts = df['text']
    df['length_of_sentence'] = texts.apply(len)

    # lexicon_density and type_token_ratio share one definition here
    # (unique tokens / total tokens, 0 for an empty entry), so compute it once.
    diversity = texts.apply(lambda tokens: len(set(tokens)) / len(tokens) if len(tokens) > 0 else 0)
    df['lexicon_density'] = diversity
    df['type_token_ratio'] = diversity.copy()

    # Corpus-wide token frequencies; a plain loop avoids using apply() purely for side effects.
    word_counts = Counter()
    for tokens in texts:
        word_counts.update(tokens)

    rare_words = {word for word, freq in word_counts.items() if freq <= rare_threshold}
    df['rare_word_count'] = texts.apply(lambda tokens: sum(1 for token in tokens if token in rare_words))
    return df, rare_words


In [4]:
def initialize_and_save_tools(df):
    """Fit the two text vectorizers and the numeric-feature scaler on ``df``.

    Side effect: a ``text_str`` column (each ``text`` entry's tokens joined by
    single spaces) is added to ``df``. Nothing is written to disk in this
    function; the fitted objects are only returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``text`` column plus the four numeric feature columns
        ``length_of_sentence``, ``lexicon_density``, ``rare_word_count`` and
        ``type_token_ratio``.

    Returns
    -------
    tuple
        ``(scaler, count_vectorizer, tfidf_vectorizer)``, all fitted.
    """
    # Column order must stay identical to the one used when transforming later.
    numeric_columns = ['length_of_sentence', 'lexicon_density', 'rare_word_count', 'type_token_ratio']

    # The string form exists only so the vectorizers can consume it; it is never scaled.
    df['text_str'] = df['text'].apply(lambda tokens: ' '.join(str(token) for token in tokens))
    corpus = df['text_str']

    # NOTE(review): with their default token_pattern, scikit-learn's vectorizers
    # ignore single-character tokens — confirm that is intended for this data.
    count_vectorizer = CountVectorizer(max_features=5000).fit(corpus)
    tfidf_vectorizer = TfidfVectorizer(max_features=5000).fit(corpus)

    scaler = MinMaxScaler().fit(df[numeric_columns])

    return scaler, count_vectorizer, tfidf_vectorizer

In [5]:
def preprocess_and_feature_engineer(df, scaler, count_vectorizer, tfidf_vectorizer):
    """Turn a featurized frame into one dense feature matrix.

    The input ``df`` is left untouched: all work happens on a copy (with the
    ``id`` column removed when present).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``text`` plus ``length_of_sentence``, ``lexicon_density``,
        ``rare_word_count`` and ``type_token_ratio``.
    scaler, count_vectorizer, tfidf_vectorizer
        Already-fitted objects; only their ``transform`` methods are called
        (and ``toarray`` on the vectorizer outputs).

    Returns
    -------
    numpy.ndarray
        Columns laid out as: count features, TF-IDF features, scaled numeric features.
    """
    numeric_columns = ['length_of_sentence', 'lexicon_density', 'rare_word_count', 'type_token_ratio']

    def _join_tokens(tokens):
        # Vectorizers expect one string per row.
        return ' '.join(map(str, tokens))

    # 'id' must never reach the model; either branch yields a new frame.
    df_features = df.drop(columns=['id']) if 'id' in df.columns else df.copy()
    df_features['text_str'] = df_features['text'].apply(_join_tokens)
    text_column = df_features['text_str']

    # Same evaluation order as the column layout of the result.
    blocks = [
        count_vectorizer.transform(text_column).toarray(),
        tfidf_vectorizer.transform(text_column).toarray(),
        scaler.transform(df_features[numeric_columns]),
    ]
    return np.hstack(blocks)


In [6]:
def add_features_specific(df, rare_words):
    """Add the four numeric features to ``df`` using a precomputed rare-word set.

    ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column whose entries support ``len``, ``set``
        and iteration.
    rare_words : container
        Tokens regarded as rare; only membership tests are performed on it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``length_of_sentence``, ``type_token_ratio``,
        ``lexicon_density`` (a copy of ``type_token_ratio``) and
        ``rare_word_count`` added.
    """
    def _type_token_ratio(tokens):
        # Unique tokens over total tokens; empty entries score 0.
        if len(tokens) > 0:
            return len(set(tokens)) / len(tokens)
        return 0

    def _count_rare(tokens):
        return sum(token in rare_words for token in tokens)

    text_column = df['text']
    df['length_of_sentence'] = text_column.apply(len)
    df['type_token_ratio'] = text_column.apply(_type_token_ratio)
    # Lexicon density is defined here as the type/token ratio.
    df['lexicon_density'] = df['type_token_ratio']
    df['rare_word_count'] = text_column.apply(_count_rare)
    return df

In [8]:
# Read the two training domains and the test set (JSON Lines files in the working directory)
df_domain1, df_domain2, df_test = [
    load_data(path)
    for path in ('domain1_train_data.json', 'domain2_train_data.json', 'test_data.json')
]

# Fit everything on the training data only: the rare-word set, the scaler and both
# vectorizers come from the concatenation of the two domains, never from the test set.
df_train, rare_words = add_features(pd.concat([df_domain1, df_domain2], ignore_index=True))
scaler, count_vectorizer, tfidf_vectorizer = initialize_and_save_tools(df_train)

# Featurize each frame on its own, reusing the training-derived rare-word set
df_domain1, df_domain2, df_test = [
    add_features_specific(frame, rare_words)
    for frame in (df_domain1, df_domain2, df_test)
]

# Build the model-ready matrices with the fitted tools
features_domain1, features_domain2, features_test = [
    preprocess_and_feature_engineer(frame, scaler, count_vectorizer, tfidf_vectorizer)
    for frame in (df_domain1, df_domain2, df_test)
]

In [9]:
# Recompute the test-set feature columns and matrix. These are the same two calls the
# data-preparation cell already makes for df_test, so this only refreshes the values.
df_test = add_features_specific(df=df_test, rare_words=rare_words)
features_test = preprocess_and_feature_engineer(
    df=df_test,
    scaler=scaler,
    count_vectorizer=count_vectorizer,
    tfidf_vectorizer=tfidf_vectorizer,
)


In [12]:
# Restore the two previously trained stacking classifiers from disk.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code —
# only load .pkl files from a trusted source.
stacking_cls, stacking_cls2 = load('stacking_classifier.pkl'), load('stacking_classifier2.pkl')

# Score the test matrix with each model (features_test must already exist in the kernel)
test_predictions, test_predictions2 = (
    stacking_cls.predict(features_test),
    stacking_cls2.predict(features_test),
)

# Only the second model's predictions are exported; the export of the first model's
# predictions is disabled:
# pd.DataFrame(test_predictions, columns=['class']).to_csv('test_predictions.csv', index=False)
pd.DataFrame(
    test_predictions2,
    columns=['class2'],
).to_csv('test_predictions.csv', index=False)
