## End-to-End Azure NLP Project: Detect AI-Generated Text

### Clean Data

In [2]:
import pandas as pd
import numpy as np

In [1]:
#%pip install datasets

In [3]:
from datasets import load_dataset
# from huggingface_hub import list_datasets
# print(len([dataset.id for dataset in list_datasets()]))

# Load the "perlthoughts/big-brain-4k" dataset (used here as the LLM-generated corpus)
# and keep only its "train" split.
LLM_gen_dataset = load_dataset("perlthoughts/big-brain-4k")
LLM_train_set = LLM_gen_dataset['train']

Downloading readme:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/386M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250000 [00:00<?, ? examples/s]

In [4]:
# Load the "qwedsacf/ivypanda-essays" dataset (used here as the human-written corpus)
# and keep only its "train" split.
human_gen_dataset = load_dataset("qwedsacf/ivypanda-essays")
human_gen_dataset_train = human_gen_dataset['train']

Downloading readme:   0%|          | 0.00/501 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/653M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/128293 [00:00<?, ? examples/s]

#### Convert the Data Into Pandas Dataframes

In [5]:
# Materialize both train splits as pandas DataFrames so the cleaning steps
# can use regular DataFrame operations.
df_human = pd.DataFrame(human_gen_dataset_train)
df_AI = pd.DataFrame(LLM_train_set)

#### Dropping The Unnecessary Columns

In [6]:
# Discard the metadata columns; only the text column of each corpus is kept.
df_human_pcs = df_human.drop(columns=['SOURCE', '__index_level_0__'])
df_AI_pcs = df_AI.drop(columns=['system', 'prompt'])

#### Removing The Empty Strings  

In [60]:
def rplc_emptystr_w_nan(df_data_pcs):
    df_trns_nan = df_data_pcs.map(lambda x: np.nan if isinstance(x, str) and x.strip() == '' else x)
    # Create a boolean Series where each value indicates if any value in the row is NaN
    bool_series = df_trns_nan.isna().any(axis=1)
    # Use the boolean Series to index the DataFrame
    rows_with_nan = df_trns_nan[bool_series] 
    return rows_with_nan

In [34]:
rplc_emptystr_w_nan(df_human_pcs)

Unnamed: 0,TEXT


In [33]:
# Sanity check on a small numeric sample DataFrame: the helper should return
# only the rows that contain at least one NaN.
data = {'Column1': [1, 2, np.nan, 4],
        'Column2': [np.nan, 2, 3, 4],
        'Column3': [1, 0.2, 3, 4]}
df = pd.DataFrame(data)
print(df.head()) 
rplc_emptystr_w_nan(df)

   Column1  Column2  Column3
0      1.0      NaN      1.0
1      2.0      2.0      0.2
2      NaN      3.0      3.0
3      4.0      4.0      4.0


Unnamed: 0,Column1,Column2,Column3
0,1.0,,1.0
2,,3.0,3.0


In [39]:
df_AI_pcs

Unnamed: 0,output
0,The review is neutral. The reviewer did not ha...
1,"Okay, let's solve this math problem together! ..."
2,"As an AI, I understand you are asking for a tw..."
3,The sentence is acceptable. It means that the ...
4,The article does not provide the last name of ...
...,...
249995,"First, we find the prime factorization of each..."
249996,The prime numbers in the list are 23 and 29.\n...
249997,The students are advised to eat normal-sized m...
249998,"Jean thought ""David"" was special because he ma..."


In [48]:
df_AI.head()

Unnamed: 0,system,prompt,output
0,You are an AI assistant. Provide a detailed an...,Title: I did not get to see it because I could...,The review is neutral. The reviewer did not ha...
1,"You are a helpful assistant, who always provid...",Solve this math problem\n\nSolve -20*l + 41*l ...,"Okay, let's solve this math problem together! ..."
2,You are an AI assistant. You will be given a t...,Sentiment possibilities Possible answers: 1). ...,"As an AI, I understand you are asking for a tw..."
3,"You are a helpful assistant, who always provid...",Multi-choice problem: Is the next sentence syn...,The sentence is acceptable. It means that the ...
4,You are an AI assistant that follows instructi...,I have a test where I am given the following a...,The article does not provide the last name of ...


In [50]:
print(df_AI["prompt"][15637])

The following article contains an answer for the question: Who steals supplies from other trucks? , can you please find it?   Cooper and Durazno knock out a truck driver and steal his rig. They take it back to a shop where it is repainted and the numbers are filed. In it they find a truckload of carburetors. Cooper abandons Durazno at a gas station and sets out as an independent driver of the yellow Peterbilt. He picks up a hitchhiker but refuses to also give a ride to the man's accompanying woman and dog. At a diner the two notice the Duke of Interstate 40 (Hector Elizondo) eating at another table. Cooper asks him about his rig, which annoys the Duke. Cooper and the hitchhiker watch Samson and Delilah at a drive-in as Cooper discusses professions he's considered as a means to make money and how he reads the almanac so that he can be learning and earning money at the same time. Cooper visits a shopkeeper and attempts to earn money by either selling some of the stolen carburetors or hus

In [35]:
rplc_emptystr_w_nan(df_AI_pcs)

Unnamed: 0,output
4408,
15637,
31616,
33376,
51534,
57974,
87873,
107100,
108134,
123018,


In [57]:
def rplc_emptystr_w_nan_v2(df_data_pcs):
    df_trns_nan = df_data_pcs.map(lambda x: np.nan if isinstance(x, str) and x.strip() == '' else x)
    # Create a boolean Series where each value indicates if any value in the row is NaN
    bool_series = df_trns_nan.isna().any(axis=1)
    # Use the boolean Series to index the DataFrame
    rows_with_nan = df_trns_nan[bool_series]
     
    return df_trns_nan, rows_with_nan


In [71]:
# Convert blank outputs to NaN, then drop those rows and renumber the index from 0.
# df_rows_nan keeps the affected rows for inspection.
df_AI_wnan, df_rows_nan = rplc_emptystr_w_nan_v2(df_AI_pcs)
df_AI_pcs = df_AI_wnan.dropna().reset_index(drop=True) 

In [78]:
df_AI_pcs

Unnamed: 0,output
0,The review is neutral. The reviewer did not ha...
1,"Okay, let's solve this math problem together! ..."
2,"As an AI, I understand you are asking for a tw..."
3,The sentence is acceptable. It means that the ...
4,The article does not provide the last name of ...
...,...
249970,"First, we find the prime factorization of each..."
249971,The prime numbers in the list are 23 and 29.\n...
249972,The students are advised to eat normal-sized m...
249973,"Jean thought ""David"" was special because he ma..."


#### Remove Duplicates

In [82]:
df_AI_pcs[df_AI_pcs.duplicated()]
# df_AI_pcs["output"][249929]

Unnamed: 0,output
214,No
313,No
450,The review is positive.
542,No.
587,No
...,...
249900,Educational institution.
249902,The writer's purpose of writing the passage is...
249929,"An example of a tweet is: ""Just finished a gre..."
249944,"Yes, this product review is negative."


In [80]:
# Drop exact duplicate rows (first occurrence is kept) and renumber the index from 0.
df_AI_pcs = df_AI_pcs.drop_duplicates(ignore_index=True)

Unnamed: 0,output
0,The review is neutral. The reviewer did not ha...
1,"Okay, let's solve this math problem together! ..."
2,"As an AI, I understand you are asking for a tw..."
3,The sentence is acceptable. It means that the ...
4,The article does not provide the last name of ...
...,...
233167,We can convert $\frac{5}{14}$ into a decimal b...
233168,"First, we find the prime factorization of each..."
233169,The students are advised to eat normal-sized m...
233170,"Jean thought ""David"" was special because he ma..."


In [81]:
df_human_pcs[df_human_pcs.duplicated()]

Unnamed: 0,TEXT


#### Remove URLs

In [85]:
import re
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"
def remove_URL(text):
    """Return ``text`` with every URL deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

remove_URL(example)

'New competition launched :'

In [86]:
# Strip URLs from the text column of both corpora.
df_AI_pcs["output"] = df_AI_pcs["output"].apply(remove_URL)
df_human_pcs["TEXT"] = df_human_pcs["TEXT"].apply(remove_URL)

: 

: 

#### Remove Emojis

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists explicit Unicode blocks only. A single
    catch-all range such as U+24C2..U+1F251 must not be used: it also spans
    CJK ideographs, kana, Hangul and other scripts, so ordinary non-English
    text would be deleted together with the emoji.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
        "\U00002702-\U000027B0"  # dingbats
        "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (star, ...)
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

In [None]:
# Strip emoji from the text column of both corpora.
df_AI_pcs["output"] = df_AI_pcs["output"].apply(remove_emoji)
df_human_pcs["TEXT"] = df_human_pcs["TEXT"].apply(remove_emoji)

#### Remove HTML Tags

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""
def remove_html(text):
    """Return ``text`` with HTML tags and comments removed; inner text is kept.

    A tag must start with ``<`` immediately followed by a letter (optionally
    after ``/`` or ``!``), so plain comparisons such as ``3 < 5 and 7 > 2``
    are left untouched. Tags and comments may span several lines.
    """
    # First alternative: <!-- comments -->; second: opening/closing tags and
    # declarations. DOTALL lets a comment body cross line breaks; the negated
    # class [^>] already matches newlines inside a tag.
    html = re.compile(r'<!--.*?-->|<[!/]?[A-Za-z][^>]*>', flags=re.DOTALL)
    return html.sub(r'', text)
print(remove_html(example))



In [None]:
# Strip HTML tags from the text column of both corpora.
df_AI_pcs["output"] = df_AI_pcs["output"].apply(remove_html)
df_human_pcs["TEXT"] = df_human_pcs["TEXT"].apply(remove_html)

#### Remove Punctuation

In [None]:
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` removed."""
    # Imported locally so the function does not depend on some other cell
    # having imported ``string``; without it the call raises NameError.
    import string

    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

example="I am a #king"
print(remove_punct(example))

In [None]:
# Strip punctuation from the text column of both corpora.
df_AI_pcs["output"] = df_AI_pcs["output"].apply(remove_punct)
df_human_pcs["TEXT"] = df_human_pcs["TEXT"].apply(remove_punct)

#### Spelling Checks:

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with words unknown to the spell checker replaced.

    The text is split on whitespace and the words are re-joined with single
    spaces. Relies on the module-level ``spell`` (a ``SpellChecker``
    instance) created in this cell.

    NOTE(review): ``spell.unknown`` appears to return lower-cased words by
    default, so capitalized misspellings may never match the membership test
    below — confirm against the installed pyspellchecker version.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # correction() can return None when it has no candidate; keep the
            # original word so that " ".join() does not raise TypeError.
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "corect me plese"
correct_spellings(text)

In [None]:
#df['text']=df['text'].apply(lambda x : correct_spellings(x)#)