In [2]:
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict

def load_and_filter_data(file_path, keywords, similarity_threshold):
    """Load a header-less CSV and keep the rows whose text mentions a keyword.

    The text is taken from column 5. A row is kept when one of ``keywords``
    occurs in it as a case-insensitive substring, or when
    ``fuzz.partial_ratio`` between the lower-cased keyword and the lower-cased
    text is at least ``similarity_threshold``. Keywords are tried in the order
    given, and only the first keyword that matches a row is counted for it.
    The per-keyword counts are printed before returning.

    Args:
        file_path: Path of the CSV file; read with ``header=None`` and
            ``latin-1`` encoding.
        keywords: Iterable of keyword strings.
        similarity_threshold: Minimum ``fuzz.partial_ratio`` score that counts
            as a fuzzy match.

    Returns:
        A DataFrame holding the matching rows. If loading or filtering raises,
        the error is printed and an empty DataFrame is returned instead.
    """
    try:
        # Load the data
        data = pd.read_csv(file_path, header=None, encoding='latin-1')

        # Initialize keyword counts
        keyword_counts = defaultdict(int)

        # Lower-case every keyword once up front instead of once per row.
        lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]

        def contains_keyword(text):
            """Return True if ``text`` matches any keyword (exactly or fuzzily)."""
            # Empty cells are read as NaN (a float). Calling .lower() on them
            # used to raise, and the outer handler then discarded the whole
            # result; a non-string cell simply cannot match a keyword.
            if not isinstance(text, str):
                return False
            text_lower = text.lower()
            for keyword, keyword_lower in lowered_keywords:
                # Substring test first; the fuzzy score is only computed when
                # the exact check fails.
                if (keyword_lower in text_lower
                        or fuzz.partial_ratio(keyword_lower, text_lower) >= similarity_threshold):
                    keyword_counts[keyword] += 1
                    return True
            return False

        # Filter tweets containing the keywords
        filtered_data = data[data[5].apply(contains_keyword)]

        print("Keyword counts:", dict(keyword_counts))  # Optionally print the counts of matches
        return filtered_data
    except Exception as e:
        # Best-effort by design: callers check `.empty` rather than catching.
        print("Error during data loading or filtering:", str(e))
        return pd.DataFrame()  # Return empty DataFrame in case of error

def save_to_excel(data, output_file_path):
    """Write ``data`` to the 'Filtered Tweets' sheet of an Excel workbook.

    The workbook is written with the openpyxl engine and without the index
    column. Errors are printed rather than raised.

    Args:
        data: DataFrame to write.
        output_file_path: Destination path of the ``.xlsx`` file.

    Returns:
        True when the workbook was written, False when writing failed. The
        flag lets callers avoid reporting success after a failed save.
    """
    try:
        # Save the filtered data to an Excel file
        with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
            data.to_excel(writer, index=False, sheet_name='Filtered Tweets')
    except Exception as e:
        print("Error saving to Excel:", str(e))
        return False
    return True

# Inputs and settings for the keyword-filtering step. These are notebook-level
# names that the following cells read.
file_path = './Datasets/training.1600000.processed.noemoticon.csv'  # Input CSV file path
output_file_path = 'filtered_tweets.xlsx'  # Desired output Excel file path
keywords = ['apple', 'amazon', 'shell', 'sprint', 'frontier']  # List of keywords to filter by
similarity_threshold = 85  # Minimum fuzz.partial_ratio score accepted as a fuzzy keyword match

# Load the CSV and keep only the rows that match a keyword. Nothing is written
# to disk here: load_and_filter_data prints the per-keyword match counts and
# returns an empty DataFrame if loading or filtering fails.
filtered_data = load_and_filter_data(file_path, keywords, similarity_threshold)



Keyword counts: {'sprint': 324, 'amazon': 598, 'apple': 3737, 'shell': 1225, 'frontier': 41}


#### `filtered_tweets.xlsx` is a filtered version of the original dataset, containing only the tweets that pertain to the 5 keyword entities ['apple', 'amazon', 'shell', 'sprint', 'frontier']

In [3]:
# Write the filtered tweets to Excel, unless the filter step produced nothing
# (an empty frame is also what load_and_filter_data returns on failure).
if filtered_data.empty:
    print("No data to save.")
else:
    save_to_excel(filtered_data, output_file_path)
    print("Data filtered and saved successfully.")

Data filtered and saved successfully.


#### Cleaning: remove unnecessary columns and split the data into sets, each covering one specific entity

In [6]:
import pandas as pd
from fuzzywuzzy import fuzz

# Settings for this cell. `similarity_threshold` re-assigns the notebook-level
# name with the same value (85) used by the keyword-filtering cell.
similarity_threshold = 85  # Minimum fuzz.partial_ratio score accepted as a match
keyword = 'apple'  # Entity whose tweets this cell extracts

# Function to check for similarity
def is_similar(text, keyword, threshold):
    """Return True when ``keyword`` fuzzily matches ``text``.

    Both strings are lower-cased and compared with ``fuzz.partial_ratio``; the
    result is True when the score is at least ``threshold``.

    Args:
        text: The text to search. Non-string values (for example the NaN that
            pandas uses for an empty cell) never match.
        keyword: Keyword string to look for.
        threshold: Minimum ``fuzz.partial_ratio`` score that counts as a match.

    Returns:
        bool: whether the score reaches ``threshold``.
    """
    # Guard against missing cells so that Series.apply does not raise
    # AttributeError on `.lower()`.
    if not isinstance(text, str):
        return False
    return fuzz.partial_ratio(keyword.lower(), text.lower()) >= threshold

# Keep the rows whose text (column 5) fuzzily matches the keyword, then drop
# the columns labelled 1-4 so that only columns 0 and 5 remain.
apple_tweets = (
    filtered_data
    .loc[filtered_data[5].apply(is_similar, keyword=keyword, threshold=similarity_threshold)]
    .drop(columns=[1, 2, 3, 4])
)

# Overwrite the first remaining column (position 0) with 0 on every row.
# NOTE(review): presumably this resets the label before manual labeling - confirm.
apple_tweets.iloc[:, 0] = 0

# Write the result to its own workbook, without the index column.
output_file_path_apple = 'apple_tweets.xlsx'
with pd.ExcelWriter(output_file_path_apple, engine='openpyxl') as writer:
    apple_tweets.to_excel(writer, index=False, sheet_name='Apple Tweets')

print("Filtered data saved successfully to", output_file_path_apple)


Filtered data saved successfully to apple_tweets.xlsx


#### Divide into equal parts for labeling

In [7]:
# Cut three consecutive, non-overlapping slices out of apple_tweets, one per
# labeler, starting at row positions 0, 1000 and 2000.
# NOTE(review): despite the `*_500` names each part holds up to 1000 rows, and
# rows from position 3000 onward are not assigned to any part.
first_500, second_500, third_500 = (
    apple_tweets.iloc[start:start + 1000] for start in (0, 1000, 2000)
)

# Function to save each part to an Excel file
def save_part_to_excel(part, filename):
    """Write ``part`` to ``<filename>.xlsx`` on a sheet named 'Tweets'.

    Args:
        part: DataFrame to write; the index column is not written.
        filename: Target file name without the ``.xlsx`` extension.

    Prints a confirmation line with the resulting path once the file is written.
    """
    workbook_path = f'{filename}.xlsx'
    with pd.ExcelWriter(workbook_path, engine='openpyxl') as xlsx_writer:
        part.to_excel(xlsx_writer, index=False, sheet_name='Tweets')
    print(f"Data saved successfully to {workbook_path}")

# Save each part to its own workbook; save_part_to_excel appends '.xlsx' to
# the name given here and prints the resulting path.
save_part_to_excel(first_500, 'apple_tweets_nico')
save_part_to_excel(second_500, 'apple_tweets_devin')
save_part_to_excel(third_500, 'apple_tweets_hemanth')


Data saved successfully to apple_tweets_nico.xlsx
Data saved successfully to apple_tweets_devin.xlsx
Data saved successfully to apple_tweets_hemanth.xlsx
