In [2]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# Directory holding the competition's train.csv / test.csv. The default is the
# original local path, so existing runs are unaffected; set the TWEET_DATA_DIR
# environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get(
    "TWEET_DATA_DIR",
    r"D:\elggak\kaggle\Tweet Disaster Competition\nlp-getting-started",
)
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# NLTK data packages requested by this notebook, downloaded in this order.
for _resource in (
    'punkt',
    'wordnet',
    'omw-1.4',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# Shared WordNet lemmatizer instance used by preprocess_text_lemma.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for one word.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J -> adjective, V -> verb, R -> adverb.
    N and every other tag fall back to noun.
    """
    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag share the noun default.
    return wordnet.NOUN

def preprocess_text_lemma(text):
    """Strip punctuation, lowercase, tokenize and WordNet-lemmatize a string.

    Args:
        text: the raw string to normalise.

    Returns:
        The lemmatized tokens joined by single spaces (an empty string when
        the input yields no tokens).
    """
    text = re.sub(r'[^\w\s]', '', text).lower()
    words = word_tokenize(text)

    # Tag the whole token list in a single call. Tagging every word in its own
    # one-element list costs one tagger call per token and gives the tagger no
    # neighbouring words to work with.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tagged = nltk.pos_tag(words)

    # Only the first letter of the tag selects the WordNet POS; anything
    # unmapped is lemmatized as a noun.
    return ' '.join(
        lemmatizer.lemmatize(word, tag_dict.get(tag[0].upper(), wordnet.NOUN))
        for word, tag in tagged
    )




In [7]:
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm"); the resulting `nlp`
# callable is what preprocess_text_lemma_spacy uses to tokenize and lemmatize.
nlp = spacy.load("en_core_web_sm")

def preprocess_text_lemma_spacy(text):
    """Lowercase `text`, run it through the spaCy pipeline and return its lemmas.

    Tokens flagged by spaCy as punctuation or whitespace are dropped; the
    remaining lemmas are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip punctuation and whitespace tokens.
        if tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)



the striped bat be hang on their foot for good


In [None]:
# Single Porter stemmer instance reused by every stemming() call.
stemmer = PorterStemmer()


def stemming(text):
    """Tokenize `text` with word_tokenize and return the stemmed tokens joined by spaces."""
    stems = map(stemmer.stem, word_tokenize(text))
    return ' '.join(stems)

In [None]:
# Date mentions: numeric day/month/year (separated by - or /), numeric
# year-first dates, or "<Month name> D[,] YYYY" with a full English month name.
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)
_date_alternatives = (
    r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
    r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
    r'(?:' + '|'.join(_MONTH_NAMES) + r')\s\d{1,2},?\s\d{4}',
)
date_pattern = r'\b(' + '|'.join(_date_alternatives) + r')\b'

# Time mentions: 12-hour clock followed by upper-case AM/PM, or a two-digit-hour
# 24-hour clock with optional seconds.
_time_alternatives = (
    r'(0?[1-9]|1[0-2]):[0-5]\d\s?(AM|PM)',
    r'([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?',
)
time_pattern = r'\b(' + '|'.join(_time_alternatives) + r')\b'

def preprocess_text(text):
    """Apply regex clean-up to one raw tweet.

    Steps, in order:
      1. "bin laden" (any case) is merged into the single token "Binladen".
      2. URLs are replaced by the placeholder "http".
      3. Hashtags lose their leading "#"; the tag word is kept.
      4. Punctuation (anything that is neither a word character nor
         whitespace) is removed.
      5. Runs of digits are removed.
      6. Any word containing lower-case "news" is collapsed to "news",
         except words that start with "breaking" (any case), which are kept.

    Args:
        text: the tweet as a string.

    Returns:
        The cleaned string. Whitespace is not re-normalised, so removed
        tokens can leave extra spaces behind.
    """
    text = re.sub(r'bin laden', 'Binladen', text, flags=re.IGNORECASE)
    # "www" must start a word and be followed by a dot; a bare `www\S+` also
    # fires inside ordinary words and turns e.g. "awwww" into "ahttp".
    text = re.sub(r'http\S+|\bwww\.\S+', 'http', text)
    #text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # The "breaking" guard has to be a lookahead at the start of the word. A
    # lookbehind placed right after \b can never fail (the preceding character
    # is a non-word character there), which let "breakingnews" collapse to
    # "news" as well.
    text = re.sub(r'\b(?!(?i:breaking))\w*news\w*\b', 'news', text)
    return text

# train_df['text'] = train_df['location'].fillna('') + ' ' + train_df['text'].fillna('')
# test_df['text'] = test_df['location'].fillna('') + ' ' + test_df['text'].fillna('')
# train_df['text'] = train_df['keyword'].fillna('') + ' ' + train_df['text'].fillna('')
# test_df['text'] = test_df['keyword'].fillna('') + ' ' + test_df['text'].fillna('')
# Cleaning stages for the 'text' column, in application order: date and time
# mentions become the DATETIME placeholder, then regex clean-up, spaCy
# lemmatisation and finally Porter stemming.
_text_stages = (
    lambda tweet: re.sub(date_pattern, 'DATETIME', tweet),
    lambda tweet: re.sub(time_pattern, 'DATETIME', tweet),
    preprocess_text,
    preprocess_text_lemma_spacy,
    stemming,
)

# Both frames go through the identical stages. Each stage overwrites 'text',
# so re-running this cell processes the already cleaned text again.
for _frame in (train_df, test_df):
    for _stage in _text_stages:
        _frame['text'] = _frame['text'].apply(_stage)

# train_df['url'] = train_df['text'].str.contains(r'http|https', regex=True)
# test_df['url'] = test_df['text'].str.contains(r'http|https', regex=True)
# train_df['contains_country'] = train_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)
# test_df['contains_country'] = test_df['text'].str.contains(r'\b(israel|afghan|iran|iraq|lebanon|yemen|palestine)\b', regex=True, case=False)

# train_df['isNews'] = train_df['text'].str.contains(r'news|News|Breakingnews|BreakingNews|breakingnews', regex=True)
# test_df['isNews'] = test_df['text'].str.contains(r'news|News|Breakingnews|BreakingNews|breakingnews', regex=True)
