# Creating a dataframe with question-answer pairs

In [1]:
# Import all the Libraries
import numpy as np
import pandas as pd
import html
import re

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report

**SubReddits**

The following subreddits were chosen to collect question-answer pairs.

- science
- relationship_advice
- funny
- NoStupidQuestions
- AskReddit
- gaming
- unresolvedmysteries
- wewantplates
- disneyvacation
- talesfromretail
- antiMLM
- IDontWorkHereLady
- nevertellmetheodds
- publicfreakout

In [2]:
# Subreddit names; each one is used to build the path of a CSV file to load.
SubReddits = [
    'science',
    'relationship_advice',
    'funny',
    'NoStupidQuestions',
    'AskReddit',
    'gaming',
    'unresolvedmysteries',
    'wewantplates',
    'disneyvacation',
    'talesfromretail',
    'antiMLM',
    'IDontWorkHereLady',
    'nevertellmetheodds',
    'publicfreakout',
]

In [4]:
## Load one DataFrame per subreddit from ../data/<subreddit>.csv, in list order
read_csv = list(
    map(pd.read_csv, ('../data/' + name + '.csv' for name in SubReddits))
)

In [5]:
## Stack the per-subreddit frames into one, dropping the 'Unnamed: 0' column from each
data = pd.concat(
    [frame.drop(columns=['Unnamed: 0']) for frame in read_csv]
)

In [6]:
### Discard every row that contains at least one missing value
data = data.dropna()

In [7]:
# Renumber the rows 0..n-1; the previous index is discarded rather than kept as a column
data = data.reset_index(drop=True)

In [8]:
# Show (number of rows, number of columns) of the combined frame
data.shape

(6206, 3)

In [9]:
# Preview the first five rows of the combined frame
data.head()

Unnamed: 0,question,human_answer,ai_answer
0,Men's shoulder-to-hip ratios influence neuroph...,"Y'all, the point isn't that they confirmed tha...",. Men who have shoulder-to-hip ratios of 0.9 o...
1,Pro-circle arguments for a new futuristic city...,Who would willingly live in a super long priso...,1. This new futuristic city will create an unp...
2,Researchers have successfully transferred a ge...,"For those unfamiliar, tobacco is a plant that ...",".\n\nYes, this is possible. Scientists have us..."
3,Boosting the ‘warm glow’ feeling that people e...,I think there’s missing information in that he...,Warm-glow messaging can be used to encourage r...
4,Social myths on nuclear waste being targeted i...,"Great news. The war against nuclear power, fun...",1. Nuclear waste is too dangerous to store saf...


##### Label encoding (`human_ai`): a **Human** answer is labelled 0,
##### an **AI** answer is labelled 1

In [10]:
## Human answers: keep the question, expose the answer as 'Answer', label every row 0
data1 = (
    data[['question', 'human_answer']]
    .rename(columns={'human_answer': 'Answer'})
    .assign(human_ai=0)
)

In [11]:
## AI answers: keep the question, expose the answer as 'Answer', label every row 1
data2 = (
    data[['question', 'ai_answer']]
    .rename(columns={'ai_answer': 'Answer'})
    .assign(human_ai=1)
)

In [12]:
## Stack the human rows on top of the AI rows.
# ignore_index=True gives the result a fresh continuous 0..n-1 index in the same step.
df = pd.concat([data1, data2], ignore_index=True)

In [17]:
# Keep only the answer text and its label; the leading 'question' column is left out
df = df[['Answer', 'human_ai']]

In [21]:
# Rename the two remaining columns: answer text -> 'text', label -> 'generated'
df = df.rename(columns={'Answer': 'text', 'human_ai': 'generated'})

In [22]:
# Display the final frame (pandas truncates the middle rows in the rendered output)
df

Unnamed: 0,text,generated
0,"Y'all, the point isn't that they confirmed tha...",0
1,Who would willingly live in a super long priso...,0
2,"For those unfamiliar, tobacco is a plant that ...",0
3,I think there’s missing information in that he...,0
4,"Great news. The war against nuclear power, fun...",0
...,...,...
12407,", but in the end, he was no match for his oppo...",1
12408,"In Argentina, the police will typically appreh...",1
12409,This is a very serious crime and would be inve...,1
12410,We're all breaking down in different ways. Som...,1


In [24]:
# Write the labelled frame to disk; index=False keeps the row index out of the file
df.to_csv(path_or_buf='../data/human_ai.csv', index=False)

In [19]:
# Display the human-answer frame (rows labelled human_ai = 0)
data1

Unnamed: 0,question,Answer,human_ai
0,Men's shoulder-to-hip ratios influence neuroph...,"Y'all, the point isn't that they confirmed tha...",0
1,Pro-circle arguments for a new futuristic city...,Who would willingly live in a super long priso...,0
2,Researchers have successfully transferred a ge...,"For those unfamiliar, tobacco is a plant that ...",0
3,Boosting the ‘warm glow’ feeling that people e...,I think there’s missing information in that he...,0
4,Social myths on nuclear waste being targeted i...,"Great news. The war against nuclear power, fun...",0
...,...,...,...
6201,I mean he certainly didn't go down without a f...,Lmao. “That hurt” I bet that shit did.,0
6202,"In Argentina they capture a thief, tie him up ...",Lmao. They threw him in there with ease like I...,0
6203,3 guys jump out of a car and rob a lady. (Chic...,That poor woman. What absolute scum those guys...,0
6204,We're all breaking down...,"""The government literally announces that alien...",0


In [20]:
# Display the AI-answer frame (rows labelled human_ai = 1)
data2

Unnamed: 0,question,Answer,human_ai
0,Men's shoulder-to-hip ratios influence neuroph...,. Men who have shoulder-to-hip ratios of 0.9 o...,1
1,Pro-circle arguments for a new futuristic city...,1. This new futuristic city will create an unp...,1
2,Researchers have successfully transferred a ge...,".\n\nYes, this is possible. Scientists have us...",1
3,Boosting the ‘warm glow’ feeling that people e...,Warm-glow messaging can be used to encourage r...,1
4,Social myths on nuclear waste being targeted i...,1. Nuclear waste is too dangerous to store saf...,1
...,...,...,...
6201,I mean he certainly didn't go down without a f...,", but in the end, he was no match for his oppo...",1
6202,"In Argentina they capture a thief, tie him up ...","In Argentina, the police will typically appreh...",1
6203,3 guys jump out of a car and rob a lady. (Chic...,This is a very serious crime and would be inve...,1
6204,We're all breaking down...,We're all breaking down in different ways. Som...,1


In [22]:
# Decode any HTML entities in the AI answer at row label 2.
# This cell used to read from `result`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel. `data` carries the same
# `ai_answer` column, and its row 2 matches the output shown below.
html.unescape(data['ai_answer'][2])

'.\n\nYes, this is possible. Scientists have used genetic engineering techniques to create a variety of plants that are sterile, meaning they lack the ability to produce pollen and viable seeds. These plants have been created in a variety of crops, including tobacco. The genetic modification used to create these plants involves the introduction of a gene that disrupts the plant’s reproductive cycle, preventing it from producing pollen and viable seeds. This technique is commonly used to produce hybrid plants, which have desirable traits but are unable to reproduce.'

In [23]:
# Replace a sentence-ending '.', '!' or '?' that is followed by a blank line
# ('\n\n') with the same punctuation mark plus a single space.
# Reads from `data` instead of `result`, which is never defined in this
# notebook and raised NameError on a fresh kernel.
re.sub(r'([.!?])\n\n', r'\1 ', data['ai_answer'][2])

'. Yes, this is possible. Scientists have used genetic engineering techniques to create a variety of plants that are sterile, meaning they lack the ability to produce pollen and viable seeds. These plants have been created in a variety of crops, including tobacco. The genetic modification used to create these plants involves the introduction of a gene that disrupts the plant’s reproductive cycle, preventing it from producing pollen and viable seeds. This technique is commonly used to produce hybrid plants, which have desirable traits but are unable to reproduce.'

In [24]:
# Look at one human answer (row label 556).
# Reads from `data` instead of `result`, which is never defined in this notebook.
# NOTE(review): assumes `result` was the same frame as `data`; confirm that row 556
# still yields the answer shown in the saved output.
data['human_answer'][556]

'We wish you the best in life and dont worry man\U0001fae1'

In [27]:
import numpy as np

In [28]:
# 100 logarithmically spaced values from 10**-4 up to and including 10**2
alphas = np.logspace(start=-4, stop=2, num=100)

In [29]:
# Display the generated grid of values
alphas

array([1.00000000e-04, 1.14975700e-04, 1.32194115e-04, 1.51991108e-04,
       1.74752840e-04, 2.00923300e-04, 2.31012970e-04, 2.65608778e-04,
       3.05385551e-04, 3.51119173e-04, 4.03701726e-04, 4.64158883e-04,
       5.33669923e-04, 6.13590727e-04, 7.05480231e-04, 8.11130831e-04,
       9.32603347e-04, 1.07226722e-03, 1.23284674e-03, 1.41747416e-03,
       1.62975083e-03, 1.87381742e-03, 2.15443469e-03, 2.47707636e-03,
       2.84803587e-03, 3.27454916e-03, 3.76493581e-03, 4.32876128e-03,
       4.97702356e-03, 5.72236766e-03, 6.57933225e-03, 7.56463328e-03,
       8.69749003e-03, 1.00000000e-02, 1.14975700e-02, 1.32194115e-02,
       1.51991108e-02, 1.74752840e-02, 2.00923300e-02, 2.31012970e-02,
       2.65608778e-02, 3.05385551e-02, 3.51119173e-02, 4.03701726e-02,
       4.64158883e-02, 5.33669923e-02, 6.13590727e-02, 7.05480231e-02,
       8.11130831e-02, 9.32603347e-02, 1.07226722e-01, 1.23284674e-01,
       1.41747416e-01, 1.62975083e-01, 1.87381742e-01, 2.15443469e-01,
      

In [30]:
# Scientific-notation literal; evaluates to the float 100.0
1.00000000e+02

100.0