In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset. The CSV path is resolved relative to the current working
# directory, so this must be run from the directory the relative path assumes.
relative_path = os.path.join('..', 'Data', 'archive (26)', 'Language Detection.csv')
current_dir = os.getcwd()
file_path = os.path.join(current_dir, relative_path)
df = pd.read_csv(file_path)

# Raw inputs and target labels (the classifier consumes the string labels directly).
texts = df['Text']
y = df['Language']

# Split the *raw* text before any feature extraction so that nothing learned
# from the held-out rows (vocabulary, document frequencies) can leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# TF-IDF features. No stop-word filtering: common function words are some of the
# strongest cues for telling languages apart, and an English stop list would
# strip them from English samples only.
tfidf = TfidfVectorizer()

# Fit the vocabulary / IDF weights on the training split only, then reuse the
# fitted vectorizer to project the test split into the same feature space.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Hyperparameter search space explored by the grid search
# (2 x 3 x 3 = 18 candidate combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Base estimator; the fixed seed keeps forest construction reproducible.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, using every available core.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Score the refitted best estimator on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# Persist both artifacts: the vectorizer is needed at inference time to turn
# new text into the feature space the model was trained on.
for artifact, filename in (
    (best_rf, 'language_detection_rf_model.pkl'),
    (tfidf, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

# Function to load the saved model and vectorizer
def load_language_detection_model(model_path='language_detection_rf_model.pkl',
                                  vectorizer_path='tfidf_vectorizer.pkl'):
    """Load a persisted classifier and its matching TF-IDF vectorizer.

    Args:
        model_path: Path of the joblib file holding the trained classifier.
            Defaults to the filename used when the model is saved in this file.
        vectorizer_path: Path of the joblib file holding the fitted vectorizer.
            Defaults to the filename used when the vectorizer is saved in this file.

    Returns:
        A ``(model, vectorizer)`` tuple.

    Raises:
        FileNotFoundError: If either file does not exist.
    """
    # NOTE(security): joblib files are pickle-based, so loading one can execute
    # arbitrary code. Only load artifacts from a trusted source.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    return model, vectorizer

# Function to determine if a given text is in English
def is_english(text, model, vectorizer):
    """Return True when ``model`` classifies ``text`` as 'English'.

    Args:
        text: The string to classify. ``None`` or a blank/whitespace-only
            string carries no evidence and is reported as not English.
        model: Fitted classifier exposing ``predict(features)``.
        vectorizer: Fitted vectorizer exposing ``transform(list_of_str)``.

    Returns:
        A plain Python ``bool``.
    """
    # A blank document becomes an all-zero feature vector, for which the
    # classifier's answer is arbitrary; short-circuit instead of guessing.
    if text is None or not text.strip():
        return False

    text_features = vectorizer.transform([text])
    prediction = model.predict(text_features)
    # Coerce to a built-in bool: comparing an array element yields a
    # library-specific boolean type rather than True/False.
    return bool(prediction[0] == 'English')

# Demo: reload the persisted artifacts from disk and classify one sentence.
sample_text = "This is a sample text to check if it is in English."
language_model, tfidf_vectorizer = load_language_detection_model()
is_english_text = is_english(sample_text, language_model, tfidf_vectorizer)

# Show the verdict for the sample sentence.
print(f"Is the sample text in English? {is_english_text}")


Fitting 5 folds for each of 18 candidates, totalling 90 fits
