In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [3]:

# Load the dataset

# Define the path to the text file
file_path = 'TRAINING_DATA.txt'

# Initialize lists to store labels and sentences
labels = []
sentences = []

# Read the text file line by line
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split each line into label and sentence using tab as the delimiter
        label, sentence = line.strip().split('\t')
        labels.append(int(label))
        sentences.append(sentence)

# Create a DataFrame
data = pd.DataFrame({'label': labels, 'text': sentences})

# Display the first few rows of the DataFrame
print(data.head())


   label                                               text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...


In [4]:
data.head()

Unnamed: 0,label,text
0,1,"Cuando conocí a Janice en 2013 , una familia n..."
1,0,Hwang habló en Sur de este año por Southwest M...
2,1,Usted podría pensar Katy Perry y Robert Pattin...
3,1,Cualquiera que haya volado los cielos del crea...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p..."


TEXT PREPROCESSING

In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw')

[nltk_data] Downloading package stopwords to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

In [6]:


def preprocess_text(text):
    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)

    # remove all single characters
    text= re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 

        
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    # Lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'\W', ' ', text)
    # Tokenization
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('spanish')]
    return ' '.join(words)

# Apply preprocessing to the dataset
data['clean_blob'] = data['text'].apply(preprocess_text)

print(data['clean_blob'].head())

0    conocí janice 2013 familia necesitaba 600 punt...
1    hwang habló sur año southwest music and media ...
2    usted podría pensar katy perry robert pattinso...
3    cualquiera volado cielos creador escuchado act...
4    bueno cantante largo tiempo sentir aún remordi...
Name: clean_blob, dtype: object


FEATURE EXTRACTION

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data['clean_blob'], data['label'], test_size=0.2, random_state=42)

# Define a pipeline combining the TfidfVectorizer and LogisticRegression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

# Define the parameter grid
param_grid = {
    'tfidf__max_features': [1000, 2000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # You can also test different n-gram ranges
    'clf__C': [0.1, 1, 10]  # Regularization parameter for Logistic Regression
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the corresponding score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best cross-validation score:", best_score)



Best parameters: {'clf__C': 0.1, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}
Best cross-validation score: 0.48304267909650644


In [11]:
# with countclassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data['clean_blob'], data['label'], test_size=0.2, random_state=42, stratify=data['label'])

# Define a pipeline combining the CountVectorizer and LogisticRegression
pipeline_CV = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])

# Define the parameter grid for GridSearchCV
param_grid_CV = {
    'vectorizer__max_features': [2000, 5000, 10000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__C': [0.1, 1, 10]  # Regularization parameter for Logistic Regression
}

# Initialize GridSearchCV with stratified k-fold cross-validation
grid_search_CV = GridSearchCV(pipeline_CV, param_grid_CV, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search_CV.fit(X_train, y_train)

# Get the best parameters and the corresponding score
best_params_CV = grid_search_CV.best_params_
best_score_CV = grid_search_CV.best_score_

print("Best parameters:", best_params_CV)
print("Best cross-validation score:", best_score_CV)



Best parameters: {'clf__C': 0.1, 'vectorizer__max_features': 2000, 'vectorizer__ngram_range': (1, 1)}
Best cross-validation score: 0.4672399987289886


MODEL BUILDING

1. Logistic Regression with TFDF

In [8]:
# Initialize TF-IDF Vectorizer with the optimal number of features
tfidf_vectorizer = TfidfVectorizer(max_features=best_params['tfidf__max_features'], ngram_range=best_params['tfidf__ngram_range'])

# Transform the processed text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Initialize and train the Logistic Regression model with the best regularization parameter
model = LogisticRegression(C=best_params['clf__C'])
model.fit(X_train_tfidf, y_train)

# Predict on the validation set
y_pred = model.predict(X_val_tfidf)



In [9]:
# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.47315436241610737
Precision: 0.4842498665242926
Recall: 0.496986301369863
F1 Score: 0.4905354245538129


2. Logistic Regression with CountVectorizer

In [12]:
# Predict on the validation set
y_pred = grid_search_CV.predict(X_val)

# Evaluate the model
accuracy_CV = accuracy_score(y_val, y_pred)
precision_CV = precision_score(y_val, y_pred)
recall_CV = recall_score(y_val, y_pred)
f1_CV = f1_score(y_val, y_pred)

print(f'Accuracy: {accuracy_CV}')
print(f'Precision: {precision_CV}')
print(f'Recall: {recall_CV}')
print(f'F1 Score: {f1_CV}')

Accuracy: 0.45805369127516776
Precision: 0.46113989637305697
Recall: 0.49776286353467564
F1 Score: 0.47875201721355565
