In [23]:
#importing all the libraries
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb

In [16]:
# Load the dataset
data = pd.read_csv("preprocessed-50k.csv")
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Remove null values and create dataframes
x = train_data.dropna()
X_train = x['text']
y_train = x['source']
X_test = test_data['text']
y_test = test_data['source']

# Initialize TfidfVectorizer for nlp purposes
vectorizer = TfidfVectorizer(max_features=5000)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Encoding the target variable necessary to convert the o/p in numerical form
encoder = LabelEncoder()
y_train_transformed = encoder.fit_transform(y_train)
y_test_transformed = encoder.transform(y_test)

In [17]:
# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Remove null values and create dataframes
x = train_data.dropna()
X_train = x['text']
y_train = x['source']
X_test = test_data['text']
y_test = test_data['source']

# Initialize TfidfVectorizer for nlp purposes
vectorizer = TfidfVectorizer(max_features=5000)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Encoding the target variable necessary to convert the o/p in numerical form
encoder = LabelEncoder()
y_train_transformed = encoder.fit_transform(y_train)
y_test_transformed = encoder.transform(y_test)

In [18]:
#4. lightgbm is basically a gradient boosting framework just like adaboost or xgboost
lgb_model = lgb.LGBMClassifier(objective='multiclass', num_class=len(encoder.classes_), n_estimators=100, random_state=42)
lgb_model.fit(X_train_transformed, y_train_transformed)
y_pred_lgb = lgb_model.predict(X_test_transformed)
accuracy_lgb = accuracy_score(y_test_transformed, y_pred_lgb)
print("LightGBM Accuracy:", accuracy_lgb)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.674706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 568331
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4995
[LightGBM] [Info] Start training from score -0.691748
[LightGBM] [Info] Start training from score -0.694548
LightGBM Accuracy: 0.8268


In [None]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'num_leaves': [31, 50, 100],         # Number of leaves in one tree (higher can increase complexity)
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage to prevent overfitting
    'n_estimators': [100, 200, 300],     # Number of boosting iterations
    'max_depth': [-1, 10, 20],           # Maximum depth of a tree
    'min_child_samples': [20, 50],       # Minimum number of data points in a leaf
    'subsample': [0.7, 0.8, 1.0],        # Subsample ratio of the training data
    'colsample_bytree': [0.7, 0.8, 1.0]  # Subsample ratio of columns (features) when constructing each tree
}

# Apply GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lgb_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_transformed, y_train_transformed)

# Train the best model
best_lgb_model = grid_search.best_estimator_

# Make predictions
y_pred_lgb = best_lgb_model.predict(X_test_transformed)

# Evaluate the model's accuracy
accuracy_lgb = accuracy_score(y_test_transformed, y_pred_lgb)
print("Best LightGBM Accuracy:", accuracy_lgb)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_transformed, y_pred_lgb, target_names=encoder.classes_))


Fitting 5 folds for each of 1458 candidates, totalling 7290 fits


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.449882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 492125
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 4995
[LightGBM] [Info] Start training from score -0.691711
[LightGBM] [Info] Start training from score -0.694586
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.983589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 493036
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 4994
[LightGBM] [Info] Start training from score -0.691773
[LightGBM] [Info] Start training from score -0.694523
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead

In [21]:
if('__main__'):
    print("\nClassification Report:")
    print(classification_report(y_test_transformed, y_pred_lgb, target_names=encoder.classes_))
    
    # Save the model and necessary components
    print("Saving model and components...")
    joblib.dump(lgb_model, 'lightgbm_model.joblib')
    joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
    joblib.dump(encoder, 'label_encoder.joblib')
    
    # Function to classify new text
    def classify_text(text):
        vectorized_text = vectorizer.transform([text])
        prediction = lgb_model.predict(vectorized_text)
        return encoder.inverse_transform(prediction)[0]
    
    # User interaction loop
    print("\nModel training and evaluation complete. You can now classify text.")
    while True:
        user_input = input("\nEnter the text you want to classify (or 'quit' to exit): ")
    
        if user_input.lower() == 'quit':
            print("Thank you for using the AI Text Classifier. Goodbye!")
            break
    
        classification = classify_text(user_input)
        print(f"\nClassification: {classification}")
    
        # Use the actual label names from your dataset
        if classification == encoder.classes_[0]:  # Assuming 0 index is for human-generated
            print(f"This text appears to be {encoder.classes_[0]}-generated.")
        else:
            print(f"This text appears to be {encoder.classes_[1]}-generated.")


Classification Report:
              precision    recall  f1-score   support

          ai       0.80      0.86      0.83      4972
       human       0.85      0.79      0.82      5028

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

Saving model and components...

Model training and evaluation complete. You can now classify text.



Enter the text you want to classify (or 'quit' to exit):  It looks like your system doesn't recognize the jupyter command, which usually means Jupyter Notebook isn't installed or the PATH environment variable isn't set up correctly



Classification: human
This text appears to be human-generated.



Enter the text you want to classify (or 'quit' to exit):  To assist you with this task, I'll guide you through the process, as I cannot directly interact with external sites like LeetCode. Here's how you can proceed.



Classification: ai
This text appears to be ai-generated.



Enter the text you want to classify (or 'quit' to exit):  Artificial intelligence has revolutionized many industries, but it's also raised concerns about job displacement and ethical implications. As technology advances, finding a balance between innovation and human-centric policies will be essential.



Classification: ai
This text appears to be ai-generated.



Enter the text you want to classify (or 'quit' to exit):  I was on my way to the grocery store when I saw an old friend from high school. We stopped to catch up for a bit, reminiscing about the times we spent studying for exams together. It's funny how time flies and we end up in such different places in life.



Classification: human
This text appears to be human-generated.



Enter the text you want to classify (or 'quit' to exit):  The process of machine learning involves feeding large datasets into algorithms that can learn patterns and make predictions. Over time, the model improves its accuracy as it processes more data, but challenges still remain in terms of interpretability and bias.



Classification: ai
This text appears to be ai-generated.



Enter the text you want to classify (or 'quit' to exit):  When I traveled to Japan, I was struck by how different everything was from my home country. The food was incredible, the people were so polite, and the culture was fascinating. I hope to go back one day and explore even more of the beautiful countryside.



Classification: human
This text appears to be human-generated.



Enter the text you want to classify (or 'quit' to exit):  In a world where digital transformation is inevitable, organizations must adapt to emerging technologies or risk falling behind. The adoption of AI-driven solutions offers unprecedented opportunities for growth, yet companies must also consider cybersecurity risks.



Classification: ai
This text appears to be ai-generated.



Enter the text you want to classify (or 'quit' to exit):  To assist you with this task, I'll guide you through the process, as I cannot directly interact with external sites like LeetCode. Here's how you can proceed.



Classification: ai
This text appears to be ai-generated.



Enter the text you want to classify (or 'quit' to exit):  quit


Thank you for using the AI Text Classifier. Goodbye!
