# Text and numerical features combined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download each NLTK resource by name.
# NOTE(review): some newer NLTK releases look up 'punkt_tab' instead of
# 'punkt' for word_tokenize — confirm against the installed version.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


# Read the dataset CSV into a DataFrame.
DATASET_CSV = "/content/drive/MyDrive/IIT Work/Difficulty tagger/java_code_embeddings.csv"
data = pd.read_csv(DATASET_CSV)

# Preview the first rows.
data.head()


# Text preprocessing
# Lazily built (stop-word set, lemmatizer) pair shared by all preprocess_text calls.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (set(stopwords.words('english')), WordNetLemmatizer())
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Normalize a piece of text into a space-joined string of lemmatized tokens.

    Steps: tokenize, drop punctuation tokens, lowercase, drop English stop
    words, lemmatize.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents an empty
            CSV cell) are treated as empty text; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The processed tokens joined by single spaces ('' for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    # Built once and reused: this function is applied row by row, and
    # re-reading the stop-word list for every row is wasted work.
    stop_words, lemmatizer = _get_text_resources()

    # Punctuation is filtered on the raw token, stop words on the lowercased one.
    lowered = (
        token.lower()
        for token in word_tokenize(text)
        if token not in string.punctuation
    )
    return ' '.join(
        lemmatizer.lemmatize(token) for token in lowered if token not in stop_words
    )



# Pipeline is needed to chain imputation and scaling; imported here so this
# section does not depend on an edit to the import block above.
from sklearn.pipeline import Pipeline

# Define text and numerical features
text_features = ['Title', 'Processed Body', 'Tags', 'Code Final']  # Replace with the names of your text columns
numerical_features = ['LOC', 'Question_Length', 'Total count urls and imgs', 'View Count', 'Answer Count', 'Score', 'User_Reputation', 'Bronze_badge', 'Gold_badge', 'Silver_badge', 'Accept_rate', 'Interval from first', 'Interval from accepted']

# Empty CSV cells are loaded as NaN, which neither the tokenizer nor
# TfidfVectorizer accepts as a document; represent missing text as ''.
data[text_features] = data[text_features].fillna('').astype(str)

# Text preprocessing of the free-text columns
data['Processed Body'] = data['Processed Body'].apply(preprocess_text)
data['Title'] = data['Title'].apply(preprocess_text)

# One TF-IDF vocabulary per text column. The remainder is dropped (the
# default): the numerical columns are handled by numerical_pipeline, and
# passing them through here would add a second, raw copy of each of them.
text_pipeline = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'Title'),
    ('tfidf2', TfidfVectorizer(), 'Processed Body'),
    ('tfidf3', TfidfVectorizer(), 'Tags'),
    ('tfidf4', TfidfVectorizer(), 'Code Final'),
])

# ColumnTransformer applies its entries side by side, not one after another,
# so imputation and scaling must be chained in a Pipeline for the scaler to
# see the imputed values (and for each column to be emitted only once).
numerical_pipeline = ColumnTransformer([
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]), numerical_features),
])

# Combine the processed text and numerical features
combined_features = FeatureUnion([
    ('text', text_pipeline),
    ('numerical', numerical_pipeline),
])

# Split the data into features and target variable
X = data[text_features + numerical_features]
y = data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the data: fit on the training split only, then reuse on the test split
X_train_processed = combined_features.fit_transform(X_train)
X_test_processed = combined_features.transform(X_test)

# Safety-net imputer over the combined matrix. The numeric branch already
# imputes, but the prediction helpers further down call imputer.transform(),
# so it is kept fitted and available under the same name.
imputer = SimpleImputer(strategy='mean')
X_train_processed = imputer.fit_transform(X_train_processed)
X_test_processed = imputer.transform(X_test_processed)

# Train the classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_processed, y_train)

# Predict the labels for test set
y_pred = rf_classifier.predict(X_test_processed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Function to get user input and make prediction
def predict_label():
    """Prompt for every feature on stdin and print the predicted label.

    Reads one value per entry of ``text_features + numerical_features``,
    runs the row through the fitted ``combined_features`` and ``imputer``
    transformers, and prints the prediction of ``rf_classifier``.
    """
    input_data = {
        feature: input(f"Enter value for '{feature}': ")
        for feature in text_features + numerical_features
    }
    input_df = pd.DataFrame([input_data])

    # input() always returns strings. Convert the numerical answers to
    # numbers; blank or non-numeric answers become NaN so the fitted
    # imputer can fill them instead of the transform failing.
    input_df[numerical_features] = input_df[numerical_features].apply(
        pd.to_numeric, errors='coerce'
    )

    # Apply the same text normalization that was applied to these two
    # columns before the vectorizers were fitted.
    for column in ('Title', 'Processed Body'):
        input_df[column] = input_df[column].apply(preprocess_text)

    # Preprocess the user input
    input_processed = combined_features.transform(input_df)
    input_processed = imputer.transform(input_processed)

    # Make prediction
    prediction = rf_classifier.predict(input_processed)
    print("Predicted Label:", prediction[0])

# Make predictions based on user input
predict_label()


In [None]:
import pickle

# Persist the trained classifier to disk. Only rf_classifier is written;
# combined_features and imputer are not part of this file.
with open('rf_classifier_model.pkl', 'wb') as model_out:
    pickle.Pickler(model_out).dump(rf_classifier)


In [None]:
# Restore the classifier from the pickle file.
# Unpickling executes code from the file, so only load files you trust.
with open('rf_classifier_model.pkl', 'rb') as model_in:
    rf_classifier_loaded = pickle.Unpickler(model_in).load()

# Function to get user input and make prediction using the loaded model
def predict_label_loaded():
    """Prompt for every feature on stdin and print the reloaded model's prediction.

    Reads one value per entry of ``text_features + numerical_features``,
    runs the row through the in-memory ``combined_features`` and ``imputer``
    transformers, and prints the prediction of ``rf_classifier_loaded``.
    """
    input_data = {
        feature: input(f"Enter value for '{feature}': ")
        for feature in text_features + numerical_features
    }
    input_df = pd.DataFrame([input_data])

    # input() always returns strings. Convert the numerical answers to
    # numbers; blank or non-numeric answers become NaN so the fitted
    # imputer can fill them instead of the transform failing.
    input_df[numerical_features] = input_df[numerical_features].apply(
        pd.to_numeric, errors='coerce'
    )

    # Apply the same text normalization that was applied to these two
    # columns before the vectorizers were fitted.
    for column in ('Title', 'Processed Body'):
        input_df[column] = input_df[column].apply(preprocess_text)

    # Preprocess the user input
    input_processed = combined_features.transform(input_df)
    input_processed = imputer.transform(input_processed)

    # Make prediction using the loaded model
    prediction = rf_classifier_loaded.predict(input_processed)
    print("Predicted Label:", prediction[0])

# Make predictions based on user input using the loaded model
predict_label_loaded()


# Text features only

In [None]:
import pickle

# Expects 'final_text' to be defined already: a DataFrame with a
# 'text_concatenated' column (model input) and a 'Label' column (target).

# Split the data into training and testing sets.
# Take explicit copies: the 'Label' column is reassigned below, and writing
# to the frames returned by the split can raise SettingWithCopyWarning.
train_data, test_data = train_test_split(final_text, test_size=0.2, random_state=42)
train_data = train_data.copy()
test_data = test_data.copy()

# Extract features using TF-IDF (vocabulary is learned from the training split only)
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['text_concatenated'])
X_test = tfidf_vectorizer.transform(test_data['text_concatenated'])

# Convert labels to numerical values.
# The mapping is built from the training labels; a label that occurs only in
# the test split has no entry and maps to NaN.
label_map = {label: idx for idx, label in enumerate(train_data['Label'].unique())}
train_data['Label'] = train_data['Label'].map(label_map)
test_data['Label'] = test_data['Label'].map(label_map)

# Train a Random Forest classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, train_data['Label'])

def predict_label(input_text):
    """Return the original label predicted for a single text.

    The text is vectorized with the fitted ``tfidf_vectorizer``, classified
    with ``classifier``, and the numeric class is translated back through
    ``label_map``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    vectorized = tfidf_vectorizer.transform([input_text])
    predicted_index = classifier.predict(vectorized)[0]
    # Flip label -> index into index -> label to decode the prediction
    index_to_label = dict(zip(label_map.values(), label_map.keys()))
    return index_to_label[predicted_index]

# Bundle classifier, vectorizer and label mapping into a single pickle file
model_bundle = (classifier, tfidf_vectorizer, label_map)
with open('model.pkl', 'wb') as bundle_out:
    pickle.Pickler(bundle_out).dump(model_bundle)

# Example usage: read one text from the user and show its predicted label
prompt = "Enter some text to predict its label: "
user_input = input(prompt)
predicted_label = predict_label(user_input)
print("Predicted label:", predicted_label)


In [None]:
import pickle

# Restore the (classifier, vectorizer, label mapping) tuple from the pickle file.
# Unpickling executes code from the file, so only load files you trust.
with open('model.pkl', 'rb') as bundle_in:
    classifier, tfidf_vectorizer, label_map = pickle.Unpickler(bundle_in).load()

def predict_label(input_text):
    """Predict the original label of one text using the reloaded model objects.

    Uses ``tfidf_vectorizer`` to vectorize the text, ``classifier`` to predict
    a numeric class, and ``label_map`` to translate that class back.
    """
    # Wrap the text in a list: the vectorizer works on a collection of documents
    features = tfidf_vectorizer.transform([input_text])
    class_index = classifier.predict(features)[0]
    # Build the reverse (index -> label) lookup and decode the prediction
    labels_by_index = {}
    for label, idx in label_map.items():
        labels_by_index[idx] = label
    return labels_by_index[class_index]

# Example usage: read one text from the user and show its predicted label
prompt = "Enter some text to predict its label: "
user_input = input(prompt)
predicted_label = predict_label(user_input)
print("Predicted label:", predicted_label)
