# Data Explory 

In [1]:
#import library 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix



In [2]:
# Load the training data and testing data 
train_data = pd.read_csv('/kaggle/input/deeptweets/train.csv')

test_data = pd.read_csv('/kaggle/input/deeptweets/test.csv')

In [3]:
train_data.groupby('Label').describe()

Unnamed: 0_level_0,TweetId,TweetId,TweetId,TweetId,TweetId,TweetId,TweetId,TweetId
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Politics,3200.0,2.940386e+17,1.41401e+16,2.477269e+17,2.885772e+17,3.005219e+17,3.042852e+17,3.068234e+17
Sports,3325.0,2.835879e+17,7.027798e+16,23909310000.0,2.998319e+17,3.043017e+17,3.059188e+17,3.068341e+17


In [4]:
tweets = train_data.copy()
tweets


Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...
...,...,...,...
6520,296675082267410433,Politics,'Photo: PM has laid a wreath at Martyrs Monume...
6521,306677536195231746,Sports,'The secret of the Chennai pitch - crumbling o...
6522,306451295307431937,Sports,@alinabhutto he isn't on Twitter either
6523,306088574221176832,Sports,'Which England player would you take out to di...


In [5]:

# missing values
tweets.isna().sum()

TweetId      0
Label        0
TweetText    0
dtype: int64

# data preprocessing

In [6]:
import re
import string
import emoji

def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    
    # Remove HTML tags
    tweet = re.sub(r'<.*?>', '', tweet)
    
    # Remove special characters and punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    
    # Remove extra whitespaces
    tweet = ' '.join(tweet.split())
    
    # Remove mentions and hashtags
    tweet = re.sub(r'@\w+|#\w+', '', tweet)
    
    return tweet



In [7]:
tweets['TweetClean'] = tweets['TweetText'].apply(clean_tweet)

In [8]:
tweets

Unnamed: 0,TweetId,Label,TweetText,TweetClean
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,SecKerry The value of the StateDept and USAID ...
1,304834304222064640,Politics,'@rraina1481 I fear so',rraina1481 I fear so
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,Watch video highlights of the wwc13 final betw...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,RT chelscanlan At Nitro Circus at AlbertPark t...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,cricketfox Always a good thing Thanks for the ...
...,...,...,...,...
6520,296675082267410433,Politics,'Photo: PM has laid a wreath at Martyrs Monume...,Photo PM has laid a wreath at Martyrs Monument...
6521,306677536195231746,Sports,'The secret of the Chennai pitch - crumbling o...,The secret of the Chennai pitch crumbling on t...
6522,306451295307431937,Sports,@alinabhutto he isn't on Twitter either,alinabhutto he isnt on Twitter either
6523,306088574221176832,Sports,'Which England player would you take out to di...,Which England player would you take out to din...


# Feature extraction

In [9]:
X_train = tweets['TweetText']
y_train = tweets['Label']
# Feature extraction using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Model Training

In [10]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


# Model Prediction

In [11]:
# Make predictions on the test data
X_test = test_data['TweetText'].apply(clean_tweet)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = model.predict(X_test_tfidf)

y_train_pred = model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9437547892720306


# submission

In [12]:
# create submission file
submission_df = pd.DataFrame({'TweetId': test_data['TweetId'], 'Label': y_pred})
submission_df.to_csv('submission_f.csv', index=False)

# 5. Additional Approaches
If more time is available, you can consider the following approaches:
* Experiment with different feature extraction techniques (e.g., word embeddings, BERT embeddings).
* Try different machine learning models (e.g., Random Forest, SVM, deep learning models).
* Perform data augmentation for text data.
* Address class imbalance issues using techniques like SMOTE.
* Explore ensemble methods to combine multiple models for improved performance.


In [13]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.9.2-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn<22 (from mlflow)
  Downloading gunicorn-21.2.0-py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: querystring-parser, gunicorn, databricks-cli, mlflow
Successfully installed databricks-cli-0.18.0 gunicorn-21.2.0 mlflow-2.9.2 querystring-parser-1.2.4


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
from sklearn.metrics import classification_report, confusion_matrix
import mlflow

# Initialize MLflow
mlflow.start_run()

# Load the training data and testing data
train_data = pd.read_csv('/kaggle/input/deeptweets/train.csv')
test_data = pd.read_csv('/kaggle/input/deeptweets/test.csv')

X_train = train_data['TweetText']
y_train = train_data['Label']

# Feature extraction using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
X_test = test_data['TweetText'].apply(clean_tweet)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = model.predict(X_test_tfidf)

# Calculate training accuracy
y_train_pred = model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Log parameters, metrics, and artifacts to MLflow
mlflow.log_params({'max_features': 1000})
mlflow.log_metrics({'Training Accuracy': train_accuracy})
mlflow.log_artifact('/kaggle/input/deeptweets/train.csv')
mlflow.log_artifact('/kaggle/input/deeptweets/test.csv')

# End MLflow run
mlflow.end_run()


In [15]:
import pandas as pd
test_data = pd.read_csv('/kaggle/input/deeptweets/test.csv')
test_data.to_csv('test.csv')

In [16]:
!pip freeze > requirements.txt