In [1]:
import pandas as pd
import re
import string

In [2]:
df = pd.read_csv("data/gpt.csv")
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Lowercasing

In [3]:
df["tweets_lower"] = df["tweets"].str.lower()
df["labels_lower"] = df["labels"].str.lower()
df.head()

Unnamed: 0,tweets,labels,tweets_lower,labels_lower
0,ChatGPT: Optimizing Language Models for Dialog...,neutral,chatgpt: optimizing language models for dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good,"try talking with chatgpt, our new ai system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral,chatgpt: optimizing language models for dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good,"thrilled to share that chatgpt, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad,"as of 2 minutes ago, @openai released their ne...",bad


In [4]:
df.drop(['tweets', 'labels'], axis=1, inplace=True)

In [5]:
df.rename(columns={"tweets_lower": "tweets", "labels_lower": "labels"}, inplace=True)

In [6]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt: optimizing language models for dialog...,neutral
1,"try talking with chatgpt, our new ai system wh...",good
2,chatgpt: optimizing language models for dialog...,neutral
3,"thrilled to share that chatgpt, our new model ...",good
4,"as of 2 minutes ago, @openai released their ne...",bad


## Removing punctuations

In [7]:
def remove_punctuation(text):
    """
    Remove punctuation characters from a given text string.

    Parameters:
    text (str): The input text from which punctuation characters should be removed.

    Returns:
    str: A new string with all punctuation characters removed.

    Example:
    >>> remove_punctuation("Hello, World!")
    'Hello World'
    """
    return re.sub(f"[{re.escape(string.punctuation)}]", '', text)

In [8]:
# Apply the function to the 'tweets' column
df['tweets'] = df['tweets'].apply(remove_punctuation)

In [9]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models for dialogu...,neutral
1,try talking with chatgpt our new ai system whi...,good
2,chatgpt optimizing language models for dialogu...,neutral
3,thrilled to share that chatgpt our new model o...,good
4,as of 2 minutes ago openai released their new ...,bad


## Removing stopwords

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Image\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def remove_stopwords(text):
    """
    Remove common English stopwords from a given text.

    This function takes an input text and removes commonly used English stopwords, such as
    'the', 'is', 'and', 'in', etc., from the text.

    Parameters:
    text (str): The input text from which stopwords should be removed.

    Returns:
    str: A new string with stopwords removed.

    Example:
    >>> remove_stopwords("This is an example sentence with some stopwords.")
    'This example sentence stopwords .'
    """
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [12]:
# Apply the function to the 'tweets' column
df['tweets'] = df['tweets'].apply(remove_stopwords)

In [13]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models dialogue ht...,neutral
1,try talking chatgpt new ai system optimized di...,good
2,chatgpt optimizing language models dialogue ht...,neutral
3,thrilled share chatgpt new model optimized dia...,good
4,2 minutes ago openai released new chatgpt nnan...,bad


## Removal of emojis

In [14]:
def remove_emoji(text):
    """
    Remove emojis and emoticons from a given text.

    This function takes an input text and removes all emojis and emoticons present
    in the text.

    Parameters:
    text (str): The input text from which emojis and emoticons should be removed.

    Returns:
    str: A new string with emojis and emoticons removed.

    Example:
    >>> remove_emoji("I love pizza! 🍕❤️😋")
    'I love pizza! '
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [15]:
# Apply the remove_emoji function to the 'tweets' column
df['tweets'] = df['tweets'].apply(remove_emoji)

In [16]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models dialogue ht...,neutral
1,try talking chatgpt new ai system optimized di...,good
2,chatgpt optimizing language models dialogue ht...,neutral
3,thrilled share chatgpt new model optimized dia...,good
4,2 minutes ago openai released new chatgpt nnan...,bad


## Removal of URLs

In [17]:
def remove_urls(text):
    """
    Remove URLs (Uniform Resource Locators) from a given text.

    This function takes an input text and removes any URLs present in the text,
    including both HTTP(S) and 'www' URLs.

    Parameters:
    text (str): The input text from which URLs should be removed.

    Returns:
    str: A new string with URLs removed.

    Example:
    >>> remove_urls("Check out my website at https://www.example.com")
    'Check out my website at '
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [18]:
# Apply the remove_urls function to the 'tweets' column
df['tweets'] = df['tweets'].apply(remove_urls)

In [19]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models dialogue ht...,neutral
1,try talking chatgpt new ai system optimized di...,good
2,chatgpt optimizing language models dialogue ht...,neutral
3,thrilled share chatgpt new model optimized dia...,good
4,2 minutes ago openai released new chatgpt nnan...,bad


## Removal of HTML Tags

In [20]:
def remove_html_tags(text):
    """
    Remove HTML tags from a given text.

    This function takes an input text and removes any HTML tags present in the text,
    leaving only the plain text content.

    Parameters:
    text (str): The input text from which HTML tags should be removed.

    Returns:
    str: A new string with HTML tags removed.

    Example:
    >>> remove_html_tags("<p>This is <b>bold</b> text.</p>")
    'This is bold text.'
    """
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [21]:
# Apply the remove_html_tags function to the 'tweets' column
df['tweets'] = df['tweets'].apply(remove_html_tags)

In [22]:
df.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models dialogue ht...,neutral
1,try talking chatgpt new ai system optimized di...,good
2,chatgpt optimizing language models dialogue ht...,neutral
3,thrilled share chatgpt new model optimized dia...,good
4,2 minutes ago openai released new chatgpt nnan...,bad


## Stemming

In [23]:
from nltk.stem import PorterStemmer

In [24]:
def perform_stemming(text):
    """
    Perform stemming on a given text.

    Stemming is the process of reducing words to their base or root form. This function
    uses the Porter Stemmer algorithm to perform stemming on the words in the input text.

    Parameters:
    text (str): The input text on which stemming should be performed.

    Returns:
    str: A new string with words stemmed.

    Example:
    >>> perform_stemming("Walking walks walked")
    'walk walk walk'
    """
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [25]:
stem_df = df.copy()
# Apply the stemming function to the 'tweets' column
stem_df['stemmed_tweets'] = stem_df['tweets'].apply(perform_stemming)

In [26]:
stem_df.head()

Unnamed: 0,tweets,labels,stemmed_tweets
0,chatgpt optimizing language models dialogue ht...,neutral,chatgpt optim languag model dialogu httpstcok9...
1,try talking chatgpt new ai system optimized di...,good,tri talk chatgpt new ai system optim dialogu f...
2,chatgpt optimizing language models dialogue ht...,neutral,chatgpt optim languag model dialogu httpstcogl...
3,thrilled share chatgpt new model optimized dia...,good,thrill share chatgpt new model optim dialog pu...
4,2 minutes ago openai released new chatgpt nnan...,bad,2 minut ago openai releas new chatgpt nnand us...


## Lemmatization

In [27]:
from nltk.stem import WordNetLemmatizer

In [28]:
def perform_lemmatization(text):
    """
    Perform lemmatization on a given text.

    Lemmatization is the process of reducing words to their base or dictionary form. This
    function uses the WordNet Lemmatizer to perform lemmatization on the words in the input text.

    Parameters:
    text (str): The input text on which lemmatization should be performed.

    Returns:
    str: A new string with words lemmatized.

    Example:
    >>> perform_lemmatization("Walking walks walked")
    'Walking walk walked'
    """
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

In [29]:
lemma_df = df.copy()
# Apply the lemmatization function to the 'tweets' column
lemma_df['lemmatized_tweets'] = lemma_df['tweets'].apply(perform_lemmatization)

In [30]:
lemma_df.head()

Unnamed: 0,tweets,labels,lemmatized_tweets
0,chatgpt optimizing language models dialogue ht...,neutral,chatgpt optimizing language model dialogue htt...
1,try talking chatgpt new ai system optimized di...,good,try talking chatgpt new ai system optimized di...
2,chatgpt optimizing language models dialogue ht...,neutral,chatgpt optimizing language model dialogue htt...
3,thrilled share chatgpt new model optimized dia...,good,thrilled share chatgpt new model optimized dia...
4,2 minutes ago openai released new chatgpt nnan...,bad,2 minute ago openai released new chatgpt nnand...
