In [1]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
#!pip install tensorflow-macos tensorflow-metal


# Set MAC to use Sillicon (MPS - Metal Performance Shaders), GPU/CPU

# Chek with Pytorch
# import torch

# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("MPS available.")
# else:
#     print("MPS not available.")
    
# device = torch.device("mps")

# # Check with Tensorflow
# import tensorflow as tf

# # Check if GPU is used
# print("It's using the GPU?:", tf.config.list_physical_devices('GPU'))

## 1. Data Acquisition and Loading

- Objective: Load the dataset containing the sentences and corresponding labels (0 for machine translation, 1 for human translation).

In [3]:
import pandas as pd

# Load training data
# Load the training and validation datasets with error handling
training_data = pd.read_csv(
    'TRAINING_DATA.txt',
    sep='\t',
    header=None,
    names=["Label", "Text"],
    quoting=3,  # To ignore special quoting issues
)

real_data = pd.read_csv(
    'REAL_DATA.txt',
    sep='\t',
    header=None,
    names=["Text"],
    quoting=3,
)

# Display basic information about the datasets
print("Training Dataset Info:")
print(training_data.info())
print("\nTraining Data Sample:")
print(training_data.head())

print("\nReal Dataset Info:")
print(real_data.info())
print("\nReal Data Sample:")
print(real_data.head())

# Check for missing values in both datasets
print("\nMissing values in training data:", training_data.isnull().sum())
print("Missing values in real data:", real_data.isnull().sum())

print("\nLabel Distribution in Training Data:")
print(training_data['Label'].value_counts())

Training Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   17877 non-null  int64 
 1   Text    17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB
None

Training Data Sample:
   Label                                               Text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...

Real Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2201 entries, 2 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2201 non-null   object
dtypes: object(1)
memory usage: 34.4+ KB
None

Real Data Sa

## 2. Text Extraction and Cleanup

- Objective: Ensure that sentences are properly extracted and cleaned before further processing.

This step involves cleaning the text data to ensure that the sentences are free from noise and unnecessary characters. The main tasks will include:
- Removing special characters, URLs, and digits.
- Converting the text to lowercase.
- Removing extra whitespace.

In [5]:
import re
import nltk
import spacy

# Download Spanish stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load spaCy's Spanish model for lemmatization
nlp = spacy.load("es_core_news_sm")

# Set of Spanish stopwords
stop_words = set(stopwords.words('spanish'))

def clean_text(text):
    """
    Cleans the input text by performing the following:
    - Lowercasing the text.
    - Removing URLs.
    - Removing special characters, digits, and punctuation.
    - Stripping extra whitespaces.
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters, digits, and punctuation
    text = re.sub(r'[^a-záéíóúñü\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def remove_stopwords_and_lemmatize(text):
    """
    Removes stopwords and applies lemmatization using spaCy (better than nltk).
    Tokenizes the final cleaned and processed text.
    """
    # Tokenize and lemmatize using spaCy
    doc = nlp(text)
    
    # Remove stopwords and perform lemmatization
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    
    # Join tokens into a cleaned sentence
    return ' '.join(tokens)

# Apply the cleaning function
training_data['Cleaned_Text'] = training_data['Text'].apply(clean_text)
real_data['Cleaned_Text'] = real_data['Text'].apply(clean_text)

# Apply stopword removal and lemmatization
training_data['Final_Cleaned_Text'] = training_data['Cleaned_Text'].apply(remove_stopwords_and_lemmatize)
real_data['Final_Cleaned_Text'] = real_data['Cleaned_Text'].apply(remove_stopwords_and_lemmatize)

# Display some cleaned samples
print("\nEnhanced Cleaned Training Data Sample:")
print(training_data[['Text', 'Cleaned_Text', 'Final_Cleaned_Text']].head())

print("\nEnhanced Cleaned Real Data Sample:")
print(real_data[['Text', 'Cleaned_Text', 'Final_Cleaned_Text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dastas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Enhanced Cleaned Training Data Sample:
                                                Text  \
0  Cuando conocí a Janice en 2013 , una familia n...   
1  Hwang habló en Sur de este año por Southwest M...   
2  Usted podría pensar Katy Perry y Robert Pattin...   
3  Cualquiera que haya volado los cielos del crea...   
4  Bueno , este cantante tendrá un LARGO tiempo p...   

                                        Cleaned_Text  \
0  cuando conocí a janice en una familia necesita...   
1  hwang habló en sur de este año por southwest m...   
2  usted podría pensar katy perry y robert pattin...   
3  cualquiera que haya volado los cielos del crea...   
4  bueno este cantante tendrá un largo tiempo par...   

                                  Final_Cleaned_Text  
0  conocí janice familia necesitar punto promedio...  
1  hwang hablar sur año southwest music and media...  
2  usted poder pensar katy perry robert pattinson...  
3  cualquiera volar cielo creador escuchar acto p...  
4  bueno ca

## 3. Text Preprocessing

Objective: Transform the raw text into a format that can be fed into a machine learning model.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer with n-grams (unigrams + bigrams)
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit the number of features to speed up computation
    ngram_range=(1, 2),  # Unigrams and bigrams
    stop_words=None,  # We already removed stopwords during preprocessing
    sublinear_tf=True  # Apply sublinear TF scaling (logarithmic transformation)
)

# Fit the TF-IDF vectorizer to the cleaned training data
X_train_tfidf = tfidf_vectorizer.fit_transform(training_data['Final_Cleaned_Text'])
X_real_tfidf = tfidf_vectorizer.transform(real_data['Final_Cleaned_Text'])

# Display the shape of the resulting matrices
print("\nTF-IDF Matrix for Training Data:")
print(f"Shape: {X_train_tfidf.shape}")  # Should be (num_samples, num_features)

print("\nTF-IDF Matrix for Real Data:")
print(f"Shape: {X_real_tfidf.shape}")


TF-IDF Matrix for Training Data:
Shape: (17877, 5000)

TF-IDF Matrix for Real Data:
Shape: (2201, 5000)


## 4. Feature Engineering

Objective: Extract useful features that enhance model performance.

In [9]:
import numpy as np
from scipy.sparse import hstack

def extract_additional_features(texts):
    """
    Extracts additional features to enhance model performance:
    - Sentence length (in characters and words)
    - Average word length
    - Punctuation density
    """
    # Calculate sentence length in characters
    char_lengths = np.array([len(text) for text in texts]).reshape(-1, 1)

    # Calculate sentence length in words
    word_lengths = np.array([len(text.split()) for text in texts]).reshape(-1, 1)

    # Calculate average word length
    avg_word_lengths = np.array([np.mean([len(word) for word in text.split()]) if len(text.split()) > 0 else 0
                                 for text in texts]).reshape(-1, 1)

    # Calculate punctuation density (percentage of punctuation characters)
    punctuation_counts = np.array([sum(1 for char in text if char in ".,!?;:") / len(text)
                                   if len(text) > 0 else 0 for text in texts]).reshape(-1, 1)

    return np.hstack([char_lengths, word_lengths, avg_word_lengths, punctuation_counts])

# Extract features for training and real datasets
additional_features_train = extract_additional_features(training_data['Final_Cleaned_Text'])
additional_features_real = extract_additional_features(real_data['Final_Cleaned_Text'])

# Combine TF-IDF features with additional features
X_train_combined = hstack([X_train_tfidf, additional_features_train])
X_real_combined = hstack([X_real_tfidf, additional_features_real])

print(f"\nCombined Training Features Shape: {X_train_combined.shape}")
print(f"Combined Real Data Features Shape: {X_real_combined.shape}")


Combined Training Features Shape: (17877, 5004)
Combined Real Data Features Shape: (2201, 5004)


## 5. Model Selection and Training

Objective: Train a classifier to distinguish between machine and human translations.

### 5.1 Model Selection Pipeline

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np

# Defined  y_train; it's not necessary if we use training_data['Label'].values
y_train = training_data['Label'].values  # Get the target labels

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='liblinear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Store evaluation results
evaluation_results = {}

# Perform cross-validation for each model using the combined feature set
print("Evaluating models using cross-validation on combined features...\n")
for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    scores = cross_val_score(model, X_train_combined, y_train, cv=5, scoring='accuracy')
    evaluation_results[model_name] = {
        "Mean Accuracy": np.mean(scores),
        "Std Accuracy": np.std(scores)
    }
    print(f"{model_name}: Mean Accuracy = {np.mean(scores):.4f}, Std = {np.std(scores):.4f}\n")

# Display all model performance results
print("\nModel Performance Summary:")
for model_name, result in evaluation_results.items():
    print(f"{model_name}: Mean Accuracy = {result['Mean Accuracy']:.4f}, Std = {result['Std Accuracy']:.4f}")

# Automatically select the best model
best_model_name = max(evaluation_results, key=lambda k: evaluation_results[k]['Mean Accuracy'])
print(f"\nSelected Best Model: {best_model_name} (Mean Accuracy = {evaluation_results[best_model_name]['Mean Accuracy']:.4f})")

Evaluating models using cross-validation on combined features...

Training and evaluating Logistic Regression...
Logistic Regression: Mean Accuracy = 0.4090, Std = 0.0086

Training and evaluating Random Forest...
Random Forest: Mean Accuracy = 0.2700, Std = 0.0074


Model Performance Summary:
Logistic Regression: Mean Accuracy = 0.4090, Std = 0.0086
Random Forest: Mean Accuracy = 0.2700, Std = 0.0074

Selected Best Model: Logistic Regression (Mean Accuracy = 0.4090)


#### Logistic Regression outperforms Random Forest based on mean accuracy, and therefore, it’s been selected as the best traditional model.

## 6. Model Evaluation

Objective: Evaluate the performance of the model on unseen data.

### 6.1 Training Both Logistic Regression and DistilBERT

#### 6.1.1 Train Logistic Regression on the entire dataset

In [17]:
best_logistic_model = LogisticRegression(max_iter=1000, solver='liblinear')
best_logistic_model.fit(X_train_combined, y_train)

# Evaluate Logistic Regression on the training set
from sklearn.metrics import classification_report

logistic_predictions = best_logistic_model.predict(X_train_combined)
print("\nLogistic Regression Classification Report (Training Set):")
print(classification_report(y_train, logistic_predictions))


Logistic Regression Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.64      0.62      0.63      8939
           1       0.63      0.65      0.64      8938

    accuracy                           0.64     17877
   macro avg       0.64      0.64      0.64     17877
weighted avg       0.64      0.64      0.64     17877



#### 6.1.2 Training DistilBERT

Fine-tune DistilBERT using the same cleaned text data.

In [19]:
# !pip install 'transformers[torch]' accelerate

# !pip install 'accelerate>=0.26.0'

In [20]:
pip show accelerate

Name: accelerate
Version: 1.3.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [21]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and dataset
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = TextDataset(
    texts=training_data['Final_Cleaned_Text'].tolist(),
    labels=training_data['Label'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

# Initialize DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Training setup
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="no"  # Fixed deprecation warning
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train DistilBERT
trainer.train()

# Save DistilBERT for later comparison
model.save_pretrained('./distilbert_finetuned')
tokenizer.save_pretrained('./distilbert_finetuned')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6885
20,0.6975
30,0.6987
40,0.6933
50,0.6998
60,0.6959
70,0.693
80,0.6991
90,0.6992
100,0.6856


('./distilbert_finetuned/tokenizer_config.json',
 './distilbert_finetuned/special_tokens_map.json',
 './distilbert_finetuned/vocab.txt',
 './distilbert_finetuned/added_tokens.json')

### Evaluating the DistilBERT Model After Training

In [23]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from sklearn.metrics import classification_report, accuracy_score

# Load the trained model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('./distilbert_finetuned')
tokenizer = DistilBertTokenizer.from_pretrained('./distilbert_finetuned')

# Set model to evaluation mode
model.eval()

# Function to tokenize and prepare the dataset for inference
def tokenize_texts(texts, tokenizer, max_len=128):
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    return encodings

# Tokenize the training dataset (or test set, if available)
encodings = tokenize_texts(training_data['Final_Cleaned_Text'].tolist(), tokenizer)

# Move tensors to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# Get predictions from the model
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Convert logits to predicted class labels (0 or 1)
predicted_labels = torch.argmax(logits, axis=1).cpu().numpy()

# Evaluate the predictions
true_labels = training_data['Label'].values
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

print(f"Accuracy: {accuracy_score(true_labels, predicted_labels):.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.05      0.10      8939
           1       0.51      0.97      0.67      8938

    accuracy                           0.51     17877
   macro avg       0.58      0.51      0.38     17877
weighted avg       0.58      0.51      0.38     17877

Accuracy: 0.5132


## 7. Post-Modeling and Saving the Model

Objective: Save the trained model and provide a mechanism for inference.

### 7.1 Saving the Logistic Regression Model

In [39]:
import joblib

# Save the trained Logistic Regression model
logistic_model_path = "best_logistic_model.pkl"
joblib.dump(best_logistic_model, logistic_model_path)
print(f"Logistic Regression model saved to {logistic_model_path}.")

Logistic Regression model saved to best_logistic_model.pkl.


### 7.2 Saving the DistilBERT Model

In [42]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Save the DistilBERT model and tokenizer
distilbert_model_path = "./distilbert_finetuned"
model.save_pretrained(distilbert_model_path)
tokenizer.save_pretrained(distilbert_model_path)
print(f"DistilBERT model and tokenizer saved to {distilbert_model_path}.")

DistilBERT model and tokenizer saved to ./distilbert_finetuned.


## 8. Testing on New Data

Objective: Test the saved model on new datasets and classify the sentences.

### 8.1 Inference Function for Logistic Regression

In [51]:
import numpy as np
from scipy.sparse import hstack

# Recompute additional features for the real dataset
def extract_additional_features(texts):
    char_lengths = np.array([len(text) for text in texts]).reshape(-1, 1)
    word_lengths = np.array([len(text.split()) for text in texts]).reshape(-1, 1)
    avg_word_lengths = np.array([np.mean([len(word) for word in text.split()]) if len(text.split()) > 0 else 0
                                 for text in texts]).reshape(-1, 1)
    punctuation_counts = np.array([sum(1 for char in text if char in ".,!?;:") / len(text)
                                   if len(text) > 0 else 0 for text in texts]).reshape(-1, 1)
    return np.hstack([char_lengths, word_lengths, avg_word_lengths, punctuation_counts])

# Extract and combine TF-IDF + additional features
additional_features_real = extract_additional_features(real_data['Final_Cleaned_Text'])
tfidf_features_real = tfidf_vectorizer.transform(real_data['Final_Cleaned_Text'])
combined_features_real = hstack([tfidf_features_real, additional_features_real])

# Predict using the saved Logistic Regression model
logistic_predictions = best_logistic_model.predict(combined_features_real)

# Store predictions
real_data_logistic = real_data.copy()
real_data_logistic['Prediction'] = logistic_predictions
real_data_logistic['Prediction_Label'] = real_data_logistic['Prediction'].apply(lambda x: 'Human' if x == 1 else 'Machine')

# Save predictions to CSV
logistic_output_path = "logistic_regression_predictions.csv"
real_data_logistic[['Text', 'Prediction_Label']].to_csv(logistic_output_path, index=False)
print(f"Logistic Regression predictions saved to {logistic_output_path}.")

Logistic Regression predictions saved to logistic_regression_predictions.csv.


### 8.2 Inference Function for DistilBERT

In [53]:
def infer_distilbert(model_path, tokenizer_path, new_sentences):
    """
    Loads the saved DistilBERT model and performs inference on new sentences.
    """
    # Load the saved model and tokenizer
    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)
    
    # Set model to evaluation mode
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Tokenize the new sentences
    encodings = tokenizer(
        new_sentences,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Get predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, axis=1).cpu().numpy()

    # Return predictions (0 for machine, 1 for human)
    return predictions

# Example: Classify sentences from the real dataset
distilbert_predictions = infer_distilbert("./distilbert_finetuned", "./distilbert_finetuned", real_sentences)

# Display predictions
for i, sentence in enumerate(real_data['Text'].head(10)):
    print(f"Sentence: {sentence}\nPrediction: {'Human' if distilbert_predictions[i] == 1 else 'Machine'}\n")

Sentence: Yo no creo que a nadie le haya encantado un pene flácido .
Prediction: Human

Sentence: No va a resolver sus problemas de crédito o mejorar su relación con su padre .
Prediction: Human

Sentence: Te encantará este !
Prediction: Human

Sentence: Yo estaba a volar a un aeropuerto varias horas de distancia , alquilar un coche , conducir a un lugar remoto en Canadá , y desactivar el teléfono .
Prediction: Human

Sentence: ( Maid En Manhattan , The Wedding Planner , Jersey Girl , Monster In Law , , Gigli , The Back-Up Plan , ¿ Qué esperar cuando se está esperando )
Prediction: Human

Sentence: Mi padre llegó con la primera ola de fuerzas aliadas en el día D , 6 de junio 1944 .
Prediction: Human

Sentence: Y podemos todos estar de acuerdo que los envases no miente ?
Prediction: Human

Sentence: Por supuesto , todos los compañeros de reparto de Casey en Happy Endings estaban allí también , como Elisha Cuthbert y Damon Wayans Jr .
Prediction: Human

Sentence: Al estilo chino capitali

### 8.3 Save Predictions to CSV

In [56]:
import pandas as pd

# Save predictions from the chosen model (DistilBERT example shown)
real_data['Prediction'] = distilbert_predictions
real_data['Prediction_Label'] = real_data['Prediction'].apply(lambda x: 'Human' if x == 1 else 'Machine')

# Save to CSV
real_data[['Text', 'Prediction_Label']].to_csv('classified_real_data.csv', index=False)
print("Predictions saved to classified_real_data.csv.")

Predictions saved to classified_real_data.csv.
