In [10]:

import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict

# Matching configuration: brand keywords to look for, and the minimum
# fuzzywuzzy score (0-100) at which an inexact match is still accepted.
keywords = [
    'dove',
    'sprint',
    'spirit',
    'delta',
]
similarity_threshold = 85

# Input CSV of tweets and destination workbook for the filtered subset.
file_path = '../Datasets/sentiment_dataset/unlabeled/training.1600000.processed.noemoticon.csv'
output_file_path = '../Datasets/sentiment_dataset/unlabeled/filtered_tweets_dove_sprint_spirit_delta.xlsx'

In [11]:
# Load and clean the data
# Tally of keyword hits; contains_keyword() increments it on every match.
keyword_counts = defaultdict(int)

# The CSV has no header row, so columns are addressed by position: drop
# columns 1-3, then give the three survivors the names used downstream.
data = (
    pd.read_csv(file_path, header=None, encoding='latin-1')
    .drop(columns=[1, 2, 3])
)
data.columns = ['keyword', 'label', 'tweet']

# Blank out the two non-text columns; 'keyword' is filled in after matching.
data = data.assign(keyword=0, label=0)
data

Unnamed: 0,keyword,label,tweet
0,0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,0,is upset that he can't update his Facebook by ...
2,0,0,@Kenichan I dived many times for the ball. Man...
3,0,0,my whole body feels itchy and like its on fire
4,0,0,"@nationwideclass no, it's not behaving at all...."
...,...,...,...
1599995,0,0,Just woke up. Having no school is the best fee...
1599996,0,0,TheWDB.com - Very cool to hear old Walt interv...
1599997,0,0,Are you ready for your MoJo Makeover? Ask me f...
1599998,0,0,Happy 38th Birthday to my boo of alll time!!! ...


In [12]:
# Function to check for keywords
def contains_keyword(text):
    """Return the first keyword that ``text`` contains, exactly or fuzzily.

    Keywords are tried in the order they appear in ``keywords``. A keyword
    matches when it is a case-insensitive substring of ``text`` or when its
    fuzzy score against ``text`` reaches ``similarity_threshold``.

    Args:
        text: The tweet (or mention handle) to inspect. Non-string values,
            such as the NaN pandas uses for an empty cell, never match.

    Returns:
        The matching entry of ``keywords`` (as written there), or ``None``.

    Side effects:
        Increments ``keyword_counts[keyword]`` on every successful match, so
        calling this repeatedly on the same text counts it repeatedly.
    """
    if not isinstance(text, str):
        # An empty CSV cell arrives as float NaN; .lower() would raise on it.
        return None

    text_lower = text.lower()
    for keyword in keywords:
        keyword_lower = keyword.lower()

        if keyword_lower in text_lower:
            # Cheap exact-substring hit; no need to run the fuzzy matcher.
            matched = True
        elif len(text_lower) < len(keyword_lower):
            # partial_ratio aligns the SHORTER string against the longer one,
            # so a fragment such as "do" would score 100 against "dove".
            # Compare whole strings instead: near-misses still pass, bare
            # fragments do not.
            matched = fuzz.ratio(keyword_lower, text_lower) >= similarity_threshold
        else:
            matched = fuzz.partial_ratio(keyword_lower, text_lower) >= similarity_threshold

        if matched:
            keyword_counts[keyword] += 1
            return keyword
    return None

# Function to exclude tweets with mentions containing keywords
def mentions_contain_keywords(tweet):
    """Tell whether any @-mention in ``tweet`` matches a keyword.

    The tweet is split on whitespace; each token beginning with ``@`` has the
    ``@`` removed, is lower-cased and handed to ``contains_keyword``. Tokens
    are checked left to right and the scan stops at the first match.

    Args:
        tweet: Tweet text to inspect.

    Returns:
        ``True`` if at least one mention matches a keyword, else ``False``.
    """
    handles = (
        token[1:].lower()
        for token in tweet.split()
        if token.startswith('@')
    )
    return any(contains_keyword(handle) is not None for handle in handles)

In [13]:
# Filter tweets
# Flag every tweet whose @-mentions match a keyword, then keep the rest, so
# that an account name alone does not count as a keyword hit.
keyword_in_mention = data['tweet'].apply(mentions_contain_keywords)
filtered_data = data.loc[~keyword_in_mention]

In [14]:
# Scan every remaining tweet exactly once. The fuzzy match is the expensive
# step, and each call also increments keyword_counts, so a second pass would
# both double the runtime and double the tallies.
matched_keyword = filtered_data['tweet'].apply(contains_keyword)

# Keep only the tweets that matched and record which keyword they matched.
# assign() builds a new frame instead of writing a column onto a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
filtered_data = (
    filtered_data[matched_keyword.notna()]
    .assign(keyword=matched_keyword)
    # sort by keyword and reset index
    .sort_values(by='keyword')
    .reset_index(drop=True)
)
filtered_data

Unnamed: 0,keyword,label,tweet
0,delta,0,@katepatto it is an overload do you reckon it...
1,delta,0,miï¿½rcoles que viene #dï¿½adeltap! baila @jaz...
2,delta,0,@dznews i am so upset at dartmouth for picking...
3,delta,0,@PRsarahevans well besides the fact that you'r...
4,delta,0,OH LOOKIE Delta mag scans kk. um OH yeah. la...
...,...,...,...
1328,sprint,0,@LeMadChef Unfortunately the Palm Pre is with ...
1329,sprint,0,No Goldsprints tonight!!!
1330,sprint,0,No Sprints tonight hopefully we will get some...
1331,sprint,0,has no Internet access on the mountain. Sprint...


In [15]:
# Save to Excel
# Write the filtered tweets to the workbook; the positional row index is
# left out of the sheet.
filtered_data.to_excel(
    excel_writer=output_file_path,
    index=False,
)