In [1]:
# Natural Language Processing Challenge - Week 4
# Classifying Machine vs. Human Translations

In [2]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Download required resources for NLP
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download punkt_tab for sentence tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# 2. Load the training dataset
train_df = pd.read_csv("TRAINING_DATA.csv", encoding="utf-8", on_bad_lines="skip")

# Strip column names to remove unwanted spaces
train_df = train_df.rename(columns=lambda x: x.strip())

# Display dataset info
print("Training Dataset Information:")
print(train_df.info())
print("\nSample Data:")
print(train_df.head())

Training Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     17877 non-null  int64 
 1   sentence  17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB
None

Sample Data:
   label                                           sentence
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...


In [6]:
# 3. Preprocess the text
def preprocess_text(text):
    if pd.isnull(text):
        return ""  # Handle missing values
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('spanish'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a string
    return ' '.join(lemmatized_tokens)

# Apply preprocessing
train_df['processed_text'] = train_df['sentence'].apply(preprocess_text)

In [7]:
# 4. Convert text to numerical format using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_df['processed_text'])
y = train_df['label']  # Assuming 'label' column contains 0 (Machine) and 1 (Human)

In [8]:
# 5. Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# 6. Train a Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

In [10]:
# 7. Evaluate the model
y_pred = classifier.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.28383668903803133
Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.24      0.25      1751
           1       0.31      0.32      0.32      1825

    accuracy                           0.28      3576
   macro avg       0.28      0.28      0.28      3576
weighted avg       0.28      0.28      0.28      3576



In [15]:
# 2. Load the training dataset
train_df = pd.read_csv("TRAINING_DATA.csv", encoding="utf-8", on_bad_lines="skip")

# Strip column names to remove unwanted spaces
train_df = train_df.rename(columns=lambda x: x.strip())

# Display dataset info
print("Training Dataset Information:")
print(train_df.info())
print("\nSample Data:")
print(train_df.head())

# Check class distribution
print("\nClass Distribution:")
print(train_df['label'].value_counts())

Training Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     17877 non-null  int64 
 1   sentence  17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB
None

Sample Data:
   label                                           sentence
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...

Class Distribution:
label
0    8939
1    8938
Name: count, dtype: int64


In [13]:
# 3. Preprocess the text
def preprocess_text(text):
    if pd.isnull(text):
        return ""  # Handle missing values
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('spanish'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a string
    return ' '.join(lemmatized_tokens)

# Apply preprocessing
train_df['processed_text'] = train_df['sentence'].apply(preprocess_text)

In [14]:
# 4. Convert text to numerical format using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_df['processed_text'])
y = train_df['label']  # Assuming 'label' column contains 0 (Machine) and 1 (Human)

In [16]:
# 5. Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [18]:
# 6. Train an XGBoost Classifier
from xgboost import XGBClassifier # Import the XGBClassifier class from xgboost

classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
classifier.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [19]:
# 7. Evaluate the model
y_pred = classifier.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.4527404921700224
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.32      0.37      1788
           1       0.46      0.59      0.52      1788

    accuracy                           0.45      3576
   macro avg       0.45      0.45      0.44      3576
weighted avg       0.45      0.45      0.44      3576



In [20]:
# Natural Language Processing Challenge - Week 4
# Classifying Machine vs. Human Translations with FastText

# 1. Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import unicodedata
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download required resources for NLP
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load FastText Embeddings
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [21]:
# 2. Load the training dataset
train_df = pd.read_csv("TRAINING_DATA.csv", encoding="utf-8", on_bad_lines="skip")

# Strip column names to remove unwanted spaces
train_df = train_df.rename(columns=lambda x: x.strip())

# Display dataset info
print("Training Dataset Information:")
print(train_df.info())
print("\nSample Data:")
print(train_df.head())

# Check class distribution
print("\nClass Distribution:")
print(train_df['label'].value_counts())

Training Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     17877 non-null  int64 
 1   sentence  17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB
None

Sample Data:
   label                                           sentence
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...

Class Distribution:
label
0    8939
1    8938
Name: count, dtype: int64


In [22]:
# 3. Preprocess the text
def remove_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

extra_stopwords = {'traducción', 'traduce', 'humano', 'máquina', 'texto'}
stop_words = set(stopwords.words('spanish')).union(extra_stopwords)
stemmer = SnowballStemmer('spanish')

def preprocess_text(text):
    if pd.isnull(text):
        return ""  # Handle missing values
    text = text.lower()
    text = remove_accents(text)  # Remove accents
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return ' '.join(stemmed_tokens)

# Apply preprocessing
train_df['processed_text'] = train_df['sentence'].apply(preprocess_text)

In [23]:
# 4. Convert text to numerical format using FastText

def get_embedding(text):
    tokens = word_tokenize(text)
    vectors = [fasttext_model[word] for word in tokens if word in fasttext_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)  # 300 dimensions

X = np.array([get_embedding(text) for text in train_df['processed_text']])
y = train_df['label']  # Assuming 'label' column contains 0 (Machine) and 1 (Human)

In [24]:
# 5. Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
# 6. Train an XGBoost Classifier
classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
classifier.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [26]:
# 7. Evaluate the model
y_pred = classifier.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.33808724832214765
Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.34      0.34      1788
           1       0.34      0.33      0.33      1788

    accuracy                           0.34      3576
   macro avg       0.34      0.34      0.34      3576
weighted avg       0.34      0.34      0.34      3576



In [27]:
# 8. Load the real dataset for classification
real_df = pd.read_csv("REAL_DATA.csv", encoding="utf-8", on_bad_lines="skip")
real_df = real_df.rename(columns=lambda x: x.strip())

In [28]:
# Apply preprocessing
real_df['processed_text'] = real_df['sentence'].apply(preprocess_text)

In [29]:
# Convert text to numerical format
X_real = np.array([get_embedding(text) for text in real_df['processed_text']])

In [30]:
# Predict labels for real dataset
real_df['predicted_label'] = classifier.predict(X_real)

In [31]:
real_df[['sentence', 'predicted_label']].to_csv("CLASSIFIED_REAL_DATA.csv", index=False)
print("Classification for REAL_DATA.csv completed. Results saved in CLASSIFIED_REAL_DATA.csv")

Classification for REAL_DATA.csv completed. Results saved in CLASSIFIED_REAL_DATA.csv


In [32]:
# 9. Function for classifying new text
def classify_new_text(text):
    processed_text = preprocess_text(text)
    vectorized_text = np.array([get_embedding(processed_text)])
    prediction = classifier.predict(vectorized_text)
    return "Human" if prediction[0] == 1 else "Machine"

In [33]:
# Example Usage
example_text = "Este es un ejemplo de oración a clasificar."
print("Prediction:", classify_new_text(example_text))

Prediction: Human
