# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re

# Read the training and test CSV files into DataFrames (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/path/to/train.csv', '/path/to/test.csv')
)

# Basic text preprocessing function
def basic_preprocess_text(text):
    """Normalize a raw text value into lowercase, space-separated words.

    Processing steps, in order:
      1. Lowercase the text.
      2. Drop URLs (tokens starting with ``http`` or ``www``), ``@mentions``
         and ``#hashtags``.
      3. Replace every non-word character and every digit with a space.
         Underscores count as word characters in the regex, so they are kept.
      4. Collapse runs of whitespace and strip both ends.

    Args:
        text: The raw value to clean. Normally a ``str``; ``None`` and float
            NaN (missing values) are treated as empty text, and any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``''`` when the input is missing or contains
        nothing but removed characters.
    """
    # Missing cells arrive as None or float NaN; NaN is the only float that
    # is not equal to itself. Without this guard ``.lower()`` would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs, mentions, and hashtags
    text = re.sub(r"http\S+|www\S+|https\S+|@\S+|#\S+", '', text)
    # Replace special characters and digits with spaces
    text = re.sub(r'\W|\d', ' ', text)
    # Collapse extra whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the text column in the train and test datasets
train_df['cleaned_text'] = train_df['text'].apply(basic_preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(basic_preprocess_text)

# Target variable of the training set
y = train_df['target']

# Split the cleaned text into training and validation sets BEFORE vectorizing.
# Fitting TF-IDF on the full training frame would let the validation rows
# influence the vocabulary and IDF weights, which leaks information and makes
# the validation score optimistic.
train_text, val_text, y_train, y_val = train_test_split(
    train_df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the validation and test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_text)
X_val = tfidf_vectorizer.transform(val_text)
X_test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_text'])

# Full training matrix in the same feature space. X_train_tfidf and X are not
# used below; they are kept under their original names in case other code
# refers to them.
X_train_tfidf = tfidf_vectorizer.transform(train_df['cleaned_text'])
X = X_train_tfidf

# Initialize and train the Logistic Regression model on the training split
log_reg = LogisticRegression(C=1, solver='liblinear', max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the held-out validation set and calculate the F1 score
y_val_pred = log_reg.predict(X_val)
f1 = f1_score(y_val, y_val_pred)
print("Validation F1 Score:", f1)

# Generate predictions on the test set
test_predictions = log_reg.predict(X_test_tfidf)

# Prepare the submission DataFrame: one predicted target per test id
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions
})

# Save the submission file (without the DataFrame index column)
submission_df.to_csv('/path/to/logistic_regression_submission.csv', index=False)
