# Creating the Training and Testing Datasets

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the CSV paths under
# /content/drive/MyDrive used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the Reddit comments CSV. Per the preview below it carries comment_id,
# comment_parent_id, comment_body and subreddit columns.
gptData = pd.read_csv('/content/drive/MyDrive/datasets/reddit/chatgpt-reddit.csv')

In [None]:
# Load the sentiment-labeled tweets CSV. Per the preview below it carries a
# `tweets` text column and a `labels` column with values such as good / bad / neutral.
gptDataLabeled = pd.read_csv('/content/drive/MyDrive/datasets/reddit/chatgpt_tweets_labeled.csv')

In [None]:
gptData.head()

Unnamed: 0.1,Unnamed: 0,comment_id,comment_parent_id,comment_body,subreddit
0,0,iztdxuh,t3_zj2aeu,"I've been shocked for days now, I don't need c...",r/ChatGPT
1,1,iztn0q0,t3_zj2aeu,\n\nI am so angry right now. I just wasted my...,r/ChatGPT
2,2,izudrph,t3_zj2aeu,chatgpt karma whoring is here folks! just when...,r/ChatGPT
3,3,iztfhtb,t3_zj2aeu,"Worked on me, ngl.",r/ChatGPT
4,4,izu2as9,t3_zj2aeu,"Certified 10/10, must-see moment. It really di...",r/ChatGPT


In [None]:
len(gptData)

52416

In [None]:
gptDataLabeled.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [None]:
len(gptDataLabeled)

219294

In [None]:
# Work on an independent copy so the raw Reddit frame stays untouched
# (DataFrame.copy() makes a deep copy by default).
gptDataCopy = gptData.copy()

In [None]:
gptDataCopy.head()

Unnamed: 0.1,Unnamed: 0,comment_id,comment_parent_id,comment_body,subreddit
0,0,iztdxuh,t3_zj2aeu,"I've been shocked for days now, I don't need c...",r/ChatGPT
1,1,iztn0q0,t3_zj2aeu,\n\nI am so angry right now. I just wasted my...,r/ChatGPT
2,2,izudrph,t3_zj2aeu,chatgpt karma whoring is here folks! just when...,r/ChatGPT
3,3,iztfhtb,t3_zj2aeu,"Worked on me, ngl.",r/ChatGPT
4,4,izu2as9,t3_zj2aeu,"Certified 10/10, must-see moment. It really di...",r/ChatGPT


In [None]:
import pandas as pd
import re

# Keep only usable, on-topic comments. Every step builds a new frame instead of
# mutating in place, so the cell can be re-run safely.

# 1. Remove rows that have no comment text at all (the steps below need strings).
gptDataCopy = gptDataCopy.dropna(subset=['comment_body'])

# 2. Keep comments with at least 5 word tokens (a token is a run of \w characters).
word_counts = gptDataCopy['comment_body'].apply(lambda body: len(re.findall(r'\w+', body)))
gptDataCopy = gptDataCopy[word_counts >= 5]

# 3. Keep comments that mention "ChatGPT" in any letter case.
mentions_chatgpt = gptDataCopy['comment_body'].str.contains('ChatGPT', case=False)
gptDataCopy = gptDataCopy[mentions_chatgpt]



In [None]:
len(gptDataCopy)

7323

In [None]:
# Number of filtered comments kept for the prediction set. It must not exceed
# the number of rows left after filtering, otherwise DataFrame.sample raises
# ValueError (sampling is done without replacement).
N_REDDIT_SAMPLES = 7000

# Draw the rows at random with a fixed seed for reproducibility, then renumber
# the index 0..n-1 (the old index is discarded).
gptDataCopy = (
    gptDataCopy
    .sample(n=N_REDDIT_SAMPLES, random_state=42)
    .reset_index(drop=True)
)

In [None]:
gptDataCopy.head() # contains only chat gpt related Reddit comments

Unnamed: 0.1,Unnamed: 0,comment_id,comment_parent_id,comment_body,subreddit
0,29044,j33gfr2,t3_103u2sf,"This is perfect, because I’ve used ChatGPT to ...",r/ChatGPT
1,12965,j2274k4,t1_j22668n,I wouldn't always trust what chatGPT says abou...,r/ChatGPT
2,2320,j2xclod,t1_j2w9h5c,I heard in an interview that the cost per chat...,r/Futurology
3,28435,j24lxxo,t1_j22jydt,I had it write some rudimentary functions in C...,r/ChatGPT
4,6739,j5kxvmp,t3_10j6grl,ChatGPT is so impressive though. I am using it...,r/technology


In [None]:
len(gptDataCopy)

7000

In [None]:
# Discard the Reddit identifiers and the subreddit name in a single call;
# only the row id and the comment text are kept.
gptDataCopy = gptDataCopy.drop(columns=['comment_id', 'comment_parent_id', 'subreddit'])

In [None]:
gptDataCopy.head()

Unnamed: 0.1,Unnamed: 0,comment_body
0,29044,"This is perfect, because I’ve used ChatGPT to ..."
1,12965,I wouldn't always trust what chatGPT says abou...
2,2320,I heard in an interview that the cost per chat...
3,28435,I had it write some rudimentary functions in C...
4,6739,ChatGPT is so impressive though. I am using it...


In [None]:
# Give the two remaining columns their final names.
gptDataCopy = gptDataCopy.rename(columns={'Unnamed: 0': 'Id', 'comment_body': 'Comment'})

In [None]:
gptDataCopy.head()

Unnamed: 0,Id,Comment
0,29044,"This is perfect, because I’ve used ChatGPT to ..."
1,12965,I wouldn't always trust what chatGPT says abou...
2,2320,I heard in an interview that the cost per chat...
3,28435,I had it write some rudimentary functions in C...
4,6739,ChatGPT is so impressive though. I am using it...


In [None]:
import re

In [None]:
def pre_process(text):
    """Normalize a raw comment or tweet into lowercase, punctuation-free text.

    Steps, in order:
      1. Remove links: every run of non-space characters starting at ``http``.
      2. Decode the HTML entities ``&amp``, ``&lt`` and ``&gt``.
      3. Drop every character that is not an ASCII letter, a digit, whitespace
         or ``.``. This also strips the ``@`` / ``#`` prefixes, so a mention or
         hashtag keeps its word (``@OpenAI`` becomes ``openai``).
      4. Collapse each whitespace run (newlines included) into one space.
      5. Lowercase the result.

    Args:
        text: The raw string. Non-string values (for example NaN from a missing
            cell) are returned unchanged so the caller can ``dropna`` afterwards
            instead of hitting a ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned string. It can be empty, and it can keep one leading or
        trailing space, because whitespace is collapsed but not stripped.
    """
    if not isinstance(text, str):
        return text

    # A single pass suffices: this pattern already covers the explicit
    # http:// and https:// forms.
    text = re.sub(r'http\S+', '', text)

    # Decode entities. The entity's trailing ';' and the decoded '<' / '>' are
    # removed by the character filter right below.
    text = re.sub('&amp', 'and', text)
    text = re.sub('&lt', '<', text)
    text = re.sub('&gt', '>', text)

    # Keep ASCII letters, digits, whitespace and '.' only. No '@' or '#' survives
    # this line, so no separate mention/hashtag handling is needed afterwards.
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)

    # Newlines are whitespace too, so this also flattens multi-line text.
    text = re.sub(r'\s+', ' ', text)

    return text.lower()

In [None]:
# Run every Reddit comment through the text normalizer, element by element.
gptDataCopy['Comment'] = gptDataCopy['Comment'].map(pre_process)

In [None]:
gptDataCopy.head()

Unnamed: 0,Id,Comment
0,29044,this is perfect because ive used chatgpt to wr...
1,12965,i wouldnt always trust what chatgpt says about...
2,2320,i heard in an interview that the cost per chat...
3,28435,i had it write some rudimentary functions in c...
4,6739,chatgpt is so impressive though. i am using it...


In [None]:
# Drop comments that are missing or that cleaning reduced to nothing (for
# example a comment made only of links and punctuation). dropna alone misses
# the second case: such a comment is an empty string here, but it is written as
# an empty CSV field and comes back as NaN when the file is read again.
gptDataCopy = gptDataCopy.dropna(subset=['Comment'])
gptDataCopy = gptDataCopy[gptDataCopy['Comment'].str.strip() != '']

In [None]:
len(gptDataCopy)

7000

In [None]:
gptDataCopy.head()

Unnamed: 0,Id,Comment
0,29044,this is perfect because ive used chatgpt to wr...
1,12965,i wouldnt always trust what chatgpt says about...
2,2320,i heard in an interview that the cost per chat...
3,28435,i had it write some rudimentary functions in c...
4,6739,chatgpt is so impressive though. i am using it...


In [None]:
# Save the cleaned Reddit comments (Id, Comment) without the DataFrame index.
# The trained model will be run on this file to predict its sentiment labels.
gptDataCopy.to_csv("/content/drive/MyDrive/datasets/reddit/testRedditData.csv", index=False)

In [None]:
# Work on an independent copy of the labeled tweets
# (DataFrame.copy() makes a deep copy by default).
gptDLCopy = gptDataLabeled.copy()

In [None]:
gptDLCopy.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [None]:
# Encode the sentiment classes as digit strings:
# neutral -> '0', good -> '1', bad -> '2'.
sentiment_mapping = dict(neutral='0', good='1', bad='2')

# Series.replace leaves any value that is not a key of the mapping untouched.
gptDLCopy['labels'] = gptDLCopy['labels'].replace(to_replace=sentiment_mapping)

In [None]:
gptDLCopy.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,0
1,1,"Try talking with ChatGPT, our new AI system wh...",1
2,2,ChatGPT: Optimizing Language Models for Dialog...,0
3,3,"THRILLED to share that ChatGPT, our new model ...",1
4,4,"As of 2 minutes ago, @OpenAI released their ne...",2


In [None]:
# 'Unnamed: 0' appears to be just the row number stored in the source CSV
# (it matches the index in the preview above); discard it.
gptDLCopy = gptDLCopy.drop(columns=['Unnamed: 0'])

In [None]:
# Clean the tweets with the same normalizer used for the Reddit comments, so
# the training and prediction text share one format.
gptDLCopy['tweets'] = gptDLCopy['tweets'].map(pre_process)

In [None]:
# Save the cleaned, label-encoded tweets (tweets, labels) without the DataFrame
# index. Note that the file is written into the reddit/ dataset folder.
gptDLCopy.to_csv("/content/drive/MyDrive/datasets/reddit/TrainingTwitterData.csv", index=False)

In [None]:
gptDLCopy.head()

Unnamed: 0,tweets,labels
0,chatgpt optimizing language models for dialogu...,0
1,try talking with chatgpt our new ai system whi...,1
2,chatgpt optimizing language models for dialogu...,0
3,thrilled to share that chatgpt our new model o...,1
4,as of 2 minutes ago openai released their new ...,2
