# ChatGPT analysis

## Initialize

In [2]:
import os
from dotenv import load_dotenv

import re
from datetime import date,timedelta
import time
import csv
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import random

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from collections import Counter

import multiprocessing
from multiprocessing import Manager
import importlib 

from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

from openai import OpenAI

import pickle
from pandas import option_context

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hiron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hiron\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# PARAMETERS

# Folder that holds the source article CSV files
SOURCE_FOLDER = 'data/'

# Only files whose name starts with this prefix are loaded
FILE_PREFIX = 'articles_'

# Part-of-speech (POS) tags to keep; terms tagged with anything else are ignored
POS_FILTER = {
    'CC', 'DT', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'PDT', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WRB',
}

# Instruction placed in front of each article when asking ChatGPT to edit the text
prompt = "Please edit the following paragraphs to sound more refined for a blog post.\r\n\r\n"

# CSV file that receives the ChatGPT results
output_file = 'data/chatgpt.csv'

## ChatGPT call

In [44]:
# Fetch every article CSV in SOURCE_FOLDER and read them into one dataframe
files = os.listdir(None if SOURCE_FOLDER == '' else SOURCE_FOLDER)

# os.listdir returns names in arbitrary order; sorting keeps the row order reproducible
target_sources = sorted(file for file in files if file.startswith(FILE_PREFIX))

# Fail with a readable message instead of an AttributeError on None further down
if not target_sources:
    raise FileNotFoundError(
        f"No files starting with '{FILE_PREFIX}' found in '{SOURCE_FOLDER or os.getcwd()}'"
    )

# Read all files first and concatenate once (avoids re-copying the frame per file).
# os.path.join also works when SOURCE_FOLDER has no trailing slash.
source_df = pd.concat(
    [pd.read_csv(os.path.join(SOURCE_FOLDER, source)) for source in target_sources],
    axis=0,
)

# reset_index() keeps the old per-file row index as an 'index' column,
# which is the first column written to the ChatGPT output CSV
source_df = source_df.reset_index()
source_df = source_df.rename(columns={'content': 'content orig'})
source_df['content mod'] = ""


In [None]:
#Filter the source dataframe articles as needed. This is done to reduce # of chatGPT calls
#source_df = source_df.loc[source_df['date'].str.contains('2022-01'),:]


In [None]:
# Actual chatGPT call
%%time

def modify_with_chatgpt(content):
    """Calls chatGPT with the included prompt and the article content 
    """
    content = "Please edit the following paragraphs to sound more refined for a blog post.\r\n\r\n" + content
    completion = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "user", "content": content}
      ]
    )
    return completion.choices[0].message.content

# Write each result to the output file as soon as it comes back.
# The file is opened in append mode, so rerunning the cell never erases earlier
# results (at the cost of possible duplicate rows).
# newline='' is what the csv module requires; without it, extra blank lines
# appear between rows on Windows.
with open(output_file, "a", encoding="utf-8", newline="") as file:
    writer = csv.writer(file, dialect='excel')
    # Only a brand-new (empty) file gets the header row
    if os.path.getsize(output_file) == 0:
        writer.writerow(['index','url','tag','date','content orig', 'content mod'])
    for index, row in source_df.iterrows():
        modified_content = modify_with_chatgpt(row['content orig'])
        # `row` is a per-row copy, so this fills the CSV record only; source_df is unchanged
        row['content mod'] = modified_content
        writer.writerow(row)
        # Flush after every article so finished calls are on disk if a later call fails
        file.flush()

## POS tagging

In [7]:
# Load the ChatGPT results from the output CSV.
# The ChatGPT cell above writes each edited article to this file, so the
# analysis below can run from the saved results without repeating the calls.
source_df = pd.read_csv(output_file)

In [9]:
%%time

#workers2.py holds the actual POS tagging script. Creating a separate python file so it can be done as a multi-process
import workers2
importlib.reload(workers2)


articles_stats = pd.DataFrame(columns=['url','sentence count orig','word count orig', 'sentence count mod', 'word count mod'])
word_stats = pd.DataFrame(columns=['word','pos','word count orig', 'word count mod'])



if __name__ == '__main__':
    pool = multiprocessing.Pool()
    word_counter_orig = Counter([("","")])
    word_counter_mod = Counter([("","")])
    results = pool.map(workers2.count_words_2,[(row,POS_FILTER) for index, row in source_df.iterrows()])
    for counter_orig, counter_mod, article_stats in results:
        word_counter_orig += counter_orig
        word_counter_mod += counter_mod
        articles_stats = pd.concat([articles_stats,article_stats],axis=0)
    words = []
    for key, value in word_counter_orig.items():
        words.append([key[0],key[1],value,word_counter_mod[(key[0],key[1])]])

    words_df = pd.DataFrame(words)
    words_df = words_df.rename({0:'word',1:'pos',2:'word count orig',3:'word count mod'},axis=1)
    pool.close()


CPU times: total: 45 s
Wall time: 3min 15s


## Frequency changes in top 1000 words by POS type

In [11]:
# Change the following variable to analyze different POS types
# For list of POS tags, see https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk
# The value is matched as a literal substring of the tag, so 'JJ' also selects 'JJR' and 'JJS'
pos_filter = 'JJ'


# Average of the before/after counts, stored on words_df so it is carried into the filtered table
words_df['word count avg'] = (words_df['word count mod'] + words_df['word count orig']) / 2

# regex=False: tags such as 'PRP$' and 'WP$' contain '$', which a regex would read as end-of-string
pos_mask = words_df['pos'].str.contains(pos_filter, regex=False)

# The 1000 words with the highest count in the modified text for the selected POS
top_1000_words = (
    words_df.loc[pos_mask, :]
    .groupby('word')['word count mod']
    .sum()
    .sort_values(ascending=False)
    .head(1000)
    .index.to_list()
)

top_1000_words_df = words_df.loc[words_df['word'].isin(top_1000_words) & pos_mask, :]
top_1000_words_df = top_1000_words_df.fillna(0)

In [12]:
# Preview the first 30 rows of the POS-filtered word table
top_1000_words_df.head(30)

Unnamed: 0,word,pos,word count orig,word count mod,word count avg
Loading... (need help?),,,,,


In [13]:
# Corpus-wide totals of the per-article statistics, used later for normalization

# Column -> aggregation; 'url' is counted, which gives the number of articles
agg_funcs = {
    'sentence count orig': 'sum',
    'sentence count mod': 'sum',
    'word count orig': 'sum',
    'word count mod': 'sum',
    'url': 'count',
}

# agg() returns one value per column; transpose turns that into a single-row frame
basic_stats = pd.DataFrame(articles_stats.agg(agg_funcs)).transpose()

# Average sentence length before and after the edit
for version in ('orig', 'mod'):
    basic_stats[f'word per sentence {version}'] = (
        basic_stats[f'word count {version}'] / basic_stats[f'sentence count {version}']
    )

In [14]:
# Show the single-row table of corpus totals and words-per-sentence ratios
basic_stats 

sentence count orig,sentence count mod,word count orig,word count mod,url,word per sentence orig,word per sentence mod
Loading... (need help?),,,,,,


In [15]:
# Normalize the raw word counts to occurrences per article
# (basic_stats['url'] holds the article count)

article_count = basic_stats['url'].iloc[0]

top_1000_words_pct_df = top_1000_words_df[['word','word count orig','word count mod']].copy()
for version in ('orig', 'mod'):
    top_1000_words_pct_df[f'word pct {version}'] = top_1000_words_pct_df[f'word count {version}'] / article_count



In [16]:
# Show the per-article normalized word table
top_1000_words_pct_df

Unnamed: 0,word,word count orig,word count mod,word pct orig,word pct mod
Loading... (need help?),,,,,


In [17]:
# Custom score that brings the most interesting changes to the top:
# score = [difference of the two values]^2 / [average of the two values]
pct_orig = top_1000_words_pct_df['word pct orig']
pct_mod = top_1000_words_pct_df['word pct mod']
pct_delta = pct_mod - pct_orig
top_1000_words_pct_df['score'] = 2 * pct_delta * pct_delta / (pct_mod + pct_orig)

# Human-readable relative change for display in a dataframe
def convert_to_percentage(row):
    """Return the relative change from 'word pct orig' to 'word pct mod'.

    The result is a string with an explicit sign and two decimals, e.g. '+50.00%'.
    """
    ratio = row['word pct mod'] / row['word pct orig']
    return f'{(ratio - 1.0) * 100:+.2f}%'

top_1000_words_pct_df['% relative change'] = top_1000_words_pct_df.apply(convert_to_percentage,axis=1)
# Show the terms whose frequency went up, highest score first.
# The '+' is matched literally (regex=False); the previous '\+' was an invalid
# escape sequence in a normal string.
top_1000_words_pct_df.loc[top_1000_words_pct_df['% relative change'].str.contains('+', regex=False),:].sort_values(by='score',ascending=False)
#top_1000_words_pct_df.sort_values(by='score',ascending=False)

Unnamed: 0,word,word count orig,word count mod,word pct orig,word pct mod,score,% relative change
Loading... (need help?),,,,,,,


In [20]:
# User-friendly formatting helpers for the final output
def format_2f(row):
    """Return the relative change from 'word pct orig' to 'word pct mod'.

    The result is a string with two decimals and no forced plus sign, e.g. '50.00%'.
    """
    ratio = row['word pct mod'] / row['word pct orig']
    return f'{(ratio - 1.0) * 100:.2f}%'

def style_it(styler):
    """Apply the shared presentation style to a pandas Styler and return it.

    Hides the index, styles the header cells and the caption, formats numbers
    with 3 digits of precision, and renders the 'word' column in bold.

    Args:
        styler: The Styler to configure; it must contain a 'word' column.

    Returns:
        The same styler, so the function can be used with `Styler.pipe`.
    """
    styler.hide()
    # Both rules go in ONE call: set_table_styles overwrites earlier table styles
    # by default, so a second call would drop the 'th' rule.
    styler.set_table_styles([
        {'selector':'th', 'props':[('word-wrap', ' break-word'), ('max-width','25px'), ( 'text-align', 'right') ] },
        {'selector': 'caption', 'props': 'font-size:1.25em; font-weight: bold;'},
    ])
    styler.format(precision=3)
    styler.set_properties(**{'font-weight': 'bold'}, subset=['word'])
    return styler
    

# Build the "spiked terms" table: the 30 highest-scoring words whose frequency increased
presentable_df = top_1000_words_pct_df.copy()
# Keep the word column narrow
presentable_df['word'] = presentable_df['word'].str[:30]

presentable_df = presentable_df.rename(columns ={'word count orig': 'count before', 'word count mod': 'count after', 'word pct orig': 'per article before',
       'word pct mod': 'per article after', '% relative change': '% change'})
presentable_df = presentable_df.sort_values(by='score',ascending=False)
# '+' is matched literally (regex=False); '\+' was an invalid escape sequence in a normal string
presentable_df = presentable_df.loc[presentable_df['% change'].str.contains('+', regex=False),['word','count before', 'count after','per article before','per article after','% change']]
presentable_df = presentable_df.head(30)


# Switch itables' interactive mode off while the styled table is displayed,
# and switch it back on even if display() raises
init_notebook_mode(all_interactive=False)
try:
    display(presentable_df.style.pipe(style_it).set_caption(f"Spiked terms after ChatGPT edit ({basic_stats.loc[0,'url']} articles)"))
finally:
    init_notebook_mode(all_interactive=True)


word,count before,count after,per article before,per article after,% change
valuable,1032,5732,0.094,0.525,+455.43%
crucial,960,4995,0.088,0.457,+420.31%
essential,1175,4821,0.108,0.441,+310.30%
comprehensive,491,3181,0.045,0.291,+547.86%
insightful,171,1881,0.016,0.172,+1000.00%
intricate,44,1417,0.004,0.13,+3120.45%
innovative,225,1748,0.021,0.16,+676.89%
pivotal,35,993,0.003,0.091,+2737.14%
various,3236,6030,0.296,0.552,+86.34%
invaluable,72,1020,0.007,0.093,+1316.67%


In [21]:

# Build the "dropped terms" table: the 30 highest-scoring words whose frequency decreased
presentable_df = top_1000_words_pct_df.copy()
# Keep the word column narrow
presentable_df['word'] = presentable_df['word'].str[:30]

presentable_df = presentable_df.rename(columns ={'word count orig': 'count before', 'word count mod': 'count after', 'word pct orig': 'per article before',
       'word pct mod': 'per article after', '% relative change': '% change'})
presentable_df = presentable_df.sort_values(by='score',ascending=False)
# '-' is matched literally (regex=False); '\-' was an invalid escape sequence in a normal string
presentable_df = presentable_df.loc[presentable_df['% change'].str.contains('-', regex=False),['word','count before', 'count after','per article before','per article after','% change']]
presentable_df = presentable_df.head(30)


# Switch itables' interactive mode off while the styled table is displayed,
# and switch it back on even if display() raises
init_notebook_mode(all_interactive=False)
try:
    display(presentable_df.style.pipe(style_it).set_caption(f"Dropped terms after ChatGPT edit ({basic_stats.loc[0,'url']} articles)"))
finally:
    init_notebook_mode(all_interactive=True)


word,count before,count after,per article before,per article after,% change
’,12250,105,1.121,0.01,-99.14%
other,14263,2974,1.305,0.272,-79.15%
s,7212,135,0.66,0.012,-98.13%
same,8654,754,0.792,0.069,-91.29%
many,9842,1329,0.901,0.122,-86.50%
more,15314,4135,1.402,0.378,-73.00%
different,12804,2918,1.172,0.267,-77.21%
“,6345,129,0.581,0.012,-97.97%
good,6303,289,0.577,0.026,-95.41%
first,9083,1595,0.831,0.146,-82.44%


In [51]:
# Extra script: compare the 2022 vs 2023 trend with the word-occurrence change caused by the ChatGPT edit.
# Needs the pickle output from the trend analysis script.

# NOTE: pickle.load can execute arbitrary code - only load a pickle file you produced yourself.
with open('top_1000_words_pct_df_trend.pkl', 'rb') as f:
    top_1000_words_pct_df_trend = pickle.load(f)

# Keep only the trend's relative change, renamed so it does not clash with the ChatGPT column
top_1000_words_pct_df_trend = top_1000_words_pct_df_trend.loc[:,['% relative change']].rename(columns={'% relative change':'% change 2022 vs 2023'})

# Left join: every word of the ChatGPT analysis is kept, with or without a trend value
combined_df = pd.merge(top_1000_words_pct_df,top_1000_words_pct_df_trend, on='word',how='left')

combined_df = combined_df.rename(columns ={'word count orig': 'count before', 'word count mod': 'count after', 'word pct orig': 'per article before',
       'word pct mod': 'per article after', '% relative change': '% change chatGPT'})
combined_df = combined_df.sort_values(by='score',ascending=False)
# '+' is matched literally (regex=False); '\+' was an invalid escape sequence in a normal string
combined_df = combined_df.loc[combined_df['% change chatGPT'].str.contains('+', regex=False),['word','% change chatGPT','% change 2022 vs 2023']]

# Switch itables' interactive mode off while the styled table is displayed,
# and switch it back on even if display() raises
init_notebook_mode(all_interactive=False)
try:
    display(combined_df.style.pipe(style_it).set_caption("Comparing % change of chatGPT edit & [2022 VS 2023]"))
finally:
    init_notebook_mode(all_interactive=True)


word,% change chatGPT,% change 2022 vs 2023
valuable,+455.43%,+173.39%
crucial,+420.31%,+330.07%
essential,+310.30%,+171.65%
comprehensive,+547.86%,+269.70%
insightful,+1000.00%,+220.75%
intricate,+3120.45%,+1000.00%
innovative,+676.89%,+148.00%
pivotal,+2737.14%,+1510.00%
various,+86.34%,+88.15%
invaluable,+1316.67%,+837.50%
