In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to the first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text_lemma(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    return ' '.join(words)



def remove_special_characters(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harish-4072\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harish-4072\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harish-4072\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harish-4072\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\harish-4072\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
stemmer = PorterStemmer()
def stemming(text):
    words = word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in words])

In [4]:
train_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\test.csv")

In [5]:
date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4})\b'
time_pattern = r'\b((0?[1-9]|1[0-2]):[0-5]\d\s?(AM|PM)|([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?)\b'

def preprocess_text(text):
    text = re.sub(r'bin laden', 'Binladen', text, flags=re.IGNORECASE)
    text = re.sub(r"http\S+|www\S+|https\S+", 'http', text, flags=re.MULTILINE)  
    #text = re.sub(r'\@\w+|\#','', text)  
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b(?<!breaking)news\b|\b(?<!breaking)\w*news\w*\b', 'news', text)
    return text

# train_df['text'] = train_df['location'].fillna('') + ' ' + train_df['text'].fillna('')
# test_df['text'] = test_df['location'].fillna('') + ' ' + test_df['text'].fillna('')
# train_df['text'] = train_df['keyword'].fillna('') + ' ' + train_df['text'].fillna('')
# test_df['text'] = test_df['keyword'].fillna('') + ' ' + test_df['text'].fillna('')
train_df['text'] = train_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
train_df['text'] = train_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
test_df['text'] = test_df['text'].apply(lambda x: re.sub(date_pattern, 'DATETIME', x))
test_df['text'] = test_df['text'].apply(lambda x: re.sub(time_pattern, 'DATETIME', x))
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)
train_df['text'] = train_df['text'].apply(preprocess_text_lemma)
test_df['text'] = test_df['text'].apply(preprocess_text_lemma)
train_df['text'] = train_df['text'].apply(stemming)
test_df['text'] = test_df['text'].apply(stemming)

train_df['url'] = train_df['text'].str.contains(r'http|https', regex=True)
test_df['url'] = test_df['text'].str.contains(r'http|https', regex=True)
train_df['contains_country'] = train_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)
test_df['contains_country'] = train_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)

# train_df['isNews'] = train_df['text'].str.contains(r'news|News|Breakingnews|BreakingNews|breakingnews', regex=True)
# test_df['isNews'] = test_df['text'].str.contains(r'news|News|Breakingnews|BreakingNews|breakingnews', regex=True)


  train_df['contains_country'] = train_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)
  test_df['contains_country'] = train_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)


In [6]:
train_df.head(), train_df.size,test_df.size, train_df.isnull().sum(),test_df.isnull().sum()

(   id keyword location                                               text  \
 0   1     NaN      NaN  our deed be the reason of thi earthquak may al...   
 1   4     NaN      NaN               forest fire near la rong sask canada   
 2   5     NaN      NaN  all resid ask to shelter in place be be notifi...   
 3   6     NaN      NaN     peopl receiv wildfir evacu order in california   
 4   7     NaN      NaN  just get sent thi photo from rubi alaska a smo...   
 
    target    url  contains_country  
 0       1  False             False  
 1       1  False             False  
 2       1  False             False  
 3       1  False             False  
 4       1  False             False  ,
 53291,
 19578,
 id                     0
 keyword               61
 location            2533
 text                   0
 target                 0
 url                    0
 contains_country       0
 dtype: int64,
 id                     0
 keyword               26
 location            1105
 text     

In [7]:
train_df_id = train_df['id']
test_df_id = test_df['id']
X = train_df[['text','url','contains_country']]
y = train_df['target']
X_test = test_df[['text','url','contains_country']]


In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['text'])
X_val_vec = vectorizer.transform(X_val['text'])
X_test_vec = vectorizer.transform(X_test['text'])

In [13]:
from scipy.sparse import hstack
X_train_combined = hstack([X_train_vec,X_train[['url','contains_country']].values])
X_val_combined = hstack([X_val_vec, X_val[['url','contains_country']].values])
X_test_combined = hstack([X_test_vec, X_test[['url','contains_country']].values])

In [15]:
nb_model = MultinomialNB()
nb_model.fit(X_train_combined, y_train)
y_pred = nb_model.predict(X_val_combined)
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = nb_model.predict(X_test_combined)
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_normal.csv', index=False)

Accuracy: 0.8069599474720945
              precision    recall  f1-score   support

         ham   0.809829  0.867277  0.837569       874
        spam   0.802385  0.725732  0.762136       649

    accuracy                       0.806960      1523
   macro avg   0.806107  0.796504  0.799852      1523
weighted avg   0.806657  0.806960  0.805425      1523



In [14]:
model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_val_vec)
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = model.predict(vectorizer.transform(X_test))
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_logisticregression.csv', index=False)

Accuracy: 0.799080761654629
              precision    recall  f1-score   support

         ham   0.797071  0.871854  0.832787       874
        spam   0.802469  0.701079  0.748355       649

    accuracy                       0.799081      1523
   macro avg   0.799770  0.786466  0.790571      1523
weighted avg   0.799371  0.799081  0.796808      1523



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train_vec, y_train)
y_pred = nb_model.predict(X_val_vec)

print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = nb_model.predict(vectorizer.transform(X_test))
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_tfid.csv', index=False)

Accuracy: 0.7925147734734077
              precision    recall  f1-score   support

         ham   0.794304  0.861556  0.826564       874
        spam   0.789565  0.699538  0.741830       649

    accuracy                       0.792515      1523
   macro avg   0.791935  0.780547  0.784197      1523
weighted avg   0.792285  0.792515  0.790456      1523



In [17]:
model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_val_vec)
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred, target_names=['ham', 'spam'],digits = 6))
y_pred = model.predict(vectorizer.transform(X_test))
output_df = pd.DataFrame({
    'id': test_df_id,
    'target': y_pred
})

# Save the DataFrame to a CSV file
output_df.to_csv(r'D:\Kaggle\disaster tweets\nb_logisticregression.csv', index=False)

Accuracy: 0.8036769533814839
              precision    recall  f1-score   support

         ham   0.800418  0.876430  0.836701       874
        spam   0.809187  0.705701  0.753909       649

    accuracy                       0.803677      1523
   macro avg   0.804803  0.791066  0.795305      1523
weighted avg   0.804155  0.803677  0.801421      1523



In [None]:
train_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\train.csv")
test_df = pd.read_csv(r"D:\Kaggle\disaster tweets\nlp-getting-started\test.csv")

In [None]:
train_df['url'] = train_df['text'].str.contains(r'crushed|Crushed', regex=True)

In [None]:
train_df['url'].value_counts()

In [None]:
filtered_df = train_df[train_df['url']]
filtered_df['target'].value_counts().plot(kind='bar', color=['blue', 'orange'])

plt.title('Distribution of Output for Texts Containing HTTP/HTTPS')
plt.xlabel('Output')
plt.ylabel('Count')
plt.show()

In [None]:
pd.set_option('display.max_colwidth', None)
train_df[train_df['id']==9841]['text']

In [None]:
import re

# Example time strings
times = [
    "14:30",
    "23:59:59",
    "03:45 PM",
    "11:59 AM",
    "7:30",
    "23:60",  # Invalid time
    "00:00"
]

# 24-hour time regex
time_pattern_24 = r'\b([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?\b'
# 12-hour time regex
time_pattern_12 = r'\b(0?[1-9]|1[0-2]):[0-5]\d(:[0-5]\d)?\s?(AM|PM)\b'

# Check for matches
for time in times:
    if re.match(time_pattern_24, time) or re.match(time_pattern_12, time):
        print(f"Valid time: {time}")
    else:
        print(f"Invalid time: {time}")


In [None]:
import re
import pandas as pd

# Example regex for common date formats
date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4})\b'

# Example regex for common time formats
time_pattern = r'\b((0?[1-9]|1[0-2]):[0-5]\d\s?(AM|PM)|([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?)\b'

# Example DataFrame (replace this with your actual DataFrame)
# train_df = pd.read_csv('path_to_your_tweets_disaster_dataset.csv')

# Example text to replace times and dates
train_df = pd.DataFrame({
    'id': [9833],
    'text': ["The earthquake happened on 15-03-2015 at 04:30 AM Meeting at 03:45 PM on 04/22/2022."]
})

# Replace dates with 'DATE'
train_df['text'] = train_df['text'].apply(lambda x: re.sub(date_pattern, 'DATE', x))

# Replace times with 'TIME'
train_df['text'] = train_df['text'].apply(lambda x: re.sub(time_pattern, 'TIME', x))

# Display the result
print(train_df['text'].iloc[0])

