In [2]:
#install this if you do not have this already installed
!pip install gitpython

import git
import sys


# Clone the GitHub repository
repo_url = 'https://github.com/Natural-Language-Processing-YU/Module-1-Assignment.git'
repo_dir = '/content/m1_repo'  # Specify the directory to clone the repository
git.Repo.clone_from(repo_url, repo_dir)
# Add the cloned repository directory to the import path

Collecting gitpython
  Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/196.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m194.6/196.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gitdb<5,>=4.0.1 (from gitpython)
  Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython)
  Downloading smmap-5.0.1-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.11 gitpython

<git.repo.base.Repo '/content/m1_repo/.git'>

In [3]:
#extract the tweets
import pandas as pd

tweets = pd.read_csv('/content/m1_repo/data/elonmusk_tweets.csv') #import file
print(tweets.dtypes) #print data types
print(tweets.text) #show tweets from file
df = pd.DataFrame(tweets)

id             int64
created_at    object
text          object
dtype: object
0       b'And so the robots spared humanity ... https:...
1       b"@ForIn2020 @waltmossberg @mims @defcon_5 Exa...
2           b'@waltmossberg @mims @defcon_5 Et tu, Walt?'
3                     b'Stormy weather in Shortville ...'
4       b"@DaveLeeBBC @verge Coal is dying due to nat ...
                              ...                        
2814                 b'That was a total non sequitur btw'
2815    b'Great Voltaire quote, arguably better than T...
2816    b'I made the volume on the Model S http://t.co...
2817    b"Went to Iceland on Sat to ride bumper cars o...
2818    b'Please ignore prior tweets, as that was some...
Name: text, Length: 2819, dtype: object


In [8]:
import nltk                                # Python library for NLP
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

nltk.download('stopwords')

def clean_tweet(tweet_text):
    """
    Function to process the text of a tweet: cleaning, tokenizing, removing stopwords, and stemming.
    Args:
    tweet_text: A string containing the tweet's text.

    Returns:
    list: A list of words after processing.
    """
    # Eliminate URLs
    tweet_text = re.sub(r'https?://\S+', '', tweet_text)

    # Strip out hashtag symbols
    tweet_text = re.sub(r'#', '', tweet_text)

    # Remove the leading byte order mark if present
    tweet_text = re.sub(r"^\s*b'", '', tweet_text)

    # Tokenize the tweet text using TweetTokenizer
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokenized_tweet = tweet_tokenizer.tokenize(tweet_text)

    # Filter out stopwords and punctuation
    english_stopwords = stopwords.words('english')
    filtered_tweet = [term for term in tokenized_tweet if term not in english_stopwords and term not in string.punctuation]

    # Apply stemming to the filtered tweet
    stemmer = PorterStemmer()
    stemmed_tweet = [stemmer.stem(word) for word in filtered_tweet]

    return stemmed_tweet

# Load the dataset containing tweets
tweets_df = pd.read_csv('/content/m1_repo/data/elonmusk_tweets.csv')

# Apply preprocessing to each tweet in the dataframe
tweets_df['preprocessed_tweet'] = tweets_df['text'].apply(clean_tweet)

# Display the first few rows of original and preprocessed tweets
print(tweets_df[['text', 'preprocessed_tweet']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  b'And so the robots spared humanity ... https:...   
1  b"@ForIn2020 @waltmossberg @mims @defcon_5 Exa...   
2      b'@waltmossberg @mims @defcon_5 Et tu, Walt?'   
3                b'Stormy weather in Shortville ...'   
4  b"@DaveLeeBBC @verge Coal is dying due to nat ...   

                                  preprocessed_tweet  
0                         [robot, spare, human, ...]  
1  [b, exactli, tesla, absurdli, overvalu, base, ...  
2                                     [et, tu, walt]  
3                   [stormi, weather, shortvil, ...]  
4   [b, coal, die, due, nat, ga, frack, basic, dead]  


In [9]:
# Function to compute the Levenshtein distance utilizing dynamic programming
def compute_levenshtein_distance(str1, str2):
    """
    Compute the Levenshtein distance, which is a measure of the difference between two sequences.
    Args:
    str1: First string for comparison.
    str2: Second string for comparison.

    Returns:
    int: The Levenshtein distance between str1 and str2.
    """
    # Initializing the matrix dimensions
    len_str1, len_str2 = len(str1), len(str2)
    distance_matrix = [[0] * (len_str2 + 1) for _ in range(len_str1 + 1)]

    # Populating the matrix with dynamic programming
    for x in range(len_str1 + 1):
        for y in range(len_str2 + 1):
            if x == 0:
                distance_matrix[x][y] = y
            elif y == 0:
                distance_matrix[x][y] = x
            elif str1[x - 1] == str2[y - 1]:
                distance_matrix[x][y] = distance_matrix[x - 1][y - 1]
            else:
                distance_matrix[x][y] = 1 + min(distance_matrix[x - 1][y],    # Deletion
                                                distance_matrix[x][y - 1],    # Insertion
                                                distance_matrix[x - 1][y - 1])  # Substitution

    return distance_matrix[len_str1][len_str2]

# Example usage of the function
first_string = 'stemming'
second_string = 'lemmatization'
distance = compute_levenshtein_distance(first_string, second_string)
print(f"The Levenshtein Distance is: {distance}")


The Levenshtein Distance is: 10
