In [2]:
import pandas as pd
import glob

# create a list of all CSV files in the directory
files = glob.glob('*.csv')

# read each CSV file into a DataFrame and concatenate them
df = pd.concat([pd.read_csv(f) for f in files])

# print the concatenated DataFrame
print(df)


         MsgID              Timestamp                Author   ThreadID  \
0    'jexqzl1'  '2023-04-04 09:41:10'           'double-el'  '12bi4m5'   
1    'jexr43r'  '2023-04-04 09:41:59'              'TK_TK_'  '12bi4m5'   
2    'jexr5ss'  '2023-04-04 09:42:17'          'Cairpre409'  '12afjg5'   
3    'jexr5wx'  '2023-04-04 09:42:18'  'Robin_the_sidekick'  '12b0qu8'   
4    'jexr8n6'  '2023-04-04 09:42:46'             'jadraxx'  '12b68t9'   
..         ...                    ...                   ...        ...   
935  'jfcosr4'  '2023-04-07 11:53:56'        'GhostDweller'  '12ehlze'   
936  'jfcoqlu'  '2023-04-07 11:53:32'           'edubkendo'  '1200h65'   
937  'jfcnwdj'  '2023-04-07 11:47:52'           'kyledwray'  '12ev2x2'   
938  'jfcnevh'  '2023-04-07 11:44:33'        'hillbilly316'  '1200h65'   
939  'jfcm9dp'  '2023-04-07 11:36:47'           'Dexacrash'  '10cj6uw'   

                                           ThreadTitle  \
0          'Recipe for a vegan at an outdoor barbecue

In [3]:


# convert 'Timestamp' column to datetime format


# drop 'MsgID', 'ThreadID', 'ReplyTo' and 'Permalink' columns as they might not be useful for analysis
df = df.drop(['MsgID', 'ThreadID', 'ReplyTo', 'Permalink'], axis=1)

# rename 'Author', 'ThreadTitle', and 'MsgBody' columns for clarity
df = df.rename(columns={'Author': 'AuthorName', 'ThreadTitle': 'DiscussionTitle', 'MsgBody': 'DiscussionBody'})

# drop any rows with missing values
df = df.dropna()

# reset the index
df = df.reset_index(drop=True)


In [4]:
df = df[[' Timestamp', 'AuthorName', 'DiscussionTitle', 'DiscussionBody']]

# clean the Timestamp column
df[' Timestamp'] = df[' Timestamp'].str.replace("'", "")

# clean the Timestamp column
df[' Timestamp'] = pd.to_datetime(df[' Timestamp'], format='%Y-%m-%d %H:%M:%S')


# print the first few rows to check
print(df.head())
print(df.dtypes)

            Timestamp            AuthorName  \
0 2023-04-04 09:41:10           'double-el'   
1 2023-04-04 09:41:59              'TK_TK_'   
2 2023-04-04 09:42:17          'Cairpre409'   
3 2023-04-04 09:42:18  'Robin_the_sidekick'   
4 2023-04-04 09:42:46             'jadraxx'   

                                     DiscussionTitle  \
0        'Recipe for a vegan at an outdoor barbecue'   
1        'Recipe for a vegan at an outdoor barbecue'   
2  'Weekly Food Safety Questions Thread - April 0...   
3  'Can I make a passable pasta sauce with tomato...   
4      'Whats your biscuits and gravy gravy recipe?'   

                                      DiscussionBody  
0          'It changes the texture to be more meaty'  
1           'Oh I’m definitely trying this! Thanks!'  
2  '   How long does homemade remoulade really la...  
3  'You will need a little whole grain to make a ...  
4  'Yup this is the basic recipe I use. Then I us...  
 Timestamp         datetime64[ns]
AuthorName      

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from textblob import TextBlob
import pandas as pd

# load the dataset into a pandas dataframe


# clean the text data
stop_words = set(stopwords.words('english'))
df['cleaned_text'] = df['DiscussionBody'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in stop_words]))

# apply sentiment analysis using TextBlob
df['sentiment'] = df['cleaned_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# print the overall sentiment score of each DiscussionBody
print(df[['DiscussionBody', 'sentiment']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                         DiscussionBody  sentiment
0             'It changes the texture to be more meaty'   0.000000
1              'Oh I’m definitely trying this! Thanks!'   0.125000
2     '   How long does homemade remoulade really la...   0.002423
3     'You will need a little whole grain to make a ...   0.022500
4     'Yup this is the basic recipe I use. Then I us...   0.210000
...                                                 ...        ...
4220                                       'Looks good'   0.700000
4221  'I hope you enjoy it. Let me know how it goes ...   0.400000
4222  '**Ingredients:**  2 Tbsp. Olive Oil (or Vegan...   0.174242
4223            'This looks very yummy im gonna try it'   0.000000
4224                                             'Buss'   0.000000

[4225 rows x 2 columns]


In [6]:
!pip install vaderSentiment
import nltk
nltk.download('stopwords')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [8]:
print(df.dtypes)

 Timestamp         datetime64[ns]
AuthorName                 object
DiscussionTitle            object
DiscussionBody             object
cleaned_text               object
sentiment                 float64
dtype: object


In [9]:
df.head(10)

Unnamed: 0,Timestamp,AuthorName,DiscussionTitle,DiscussionBody,cleaned_text,sentiment
0,2023-04-04 09:41:10,'double-el','Recipe for a vegan at an outdoor barbecue','It changes the texture to be more meaty','it changes texture meaty',0.0
1,2023-04-04 09:41:59,'TK_TK_','Recipe for a vegan at an outdoor barbecue','Oh I’m definitely trying this! Thanks!','oh i’m definitely trying this! thanks!',0.125
2,2023-04-04 09:42:17,'Cairpre409','Weekly Food Safety Questions Thread - April 0...,' How long does homemade remoulade really la...,' long homemade remoulade really last? red oni...,0.002423
3,2023-04-04 09:42:18,'Robin_the_sidekick','Can I make a passable pasta sauce with tomato...,'You will need a little whole grain to make a ...,'you need little whole grain make complete pro...,0.0225
4,2023-04-04 09:42:46,'jadraxx','Whats your biscuits and gravy gravy recipe?','Yup this is the basic recipe I use. Then I us...,'yup basic recipe use. usually add crushed hab...,0.21
5,2023-04-04 09:43:00,'MuppetManiac','How do people make dinner so quickly? I strug...,'What are you making? I can throw together a b...,'what making? throw together baked chicken dis...,-0.266667
6,2023-04-04 09:43:32,'under_rain_gutters','my tomato sauce wont sweeten no matter how lo...,'The problem with adding the sugar is it reali...,'the problem adding sugar realistically doesn’...,0.275
7,2023-04-04 09:43:36,'Pleasant_Choice_6130','mashed potatos with cream?','Yup me too. Ive also used buttermilk sour cre...,'yup too. ive also used buttermilk sour cream ...,-0.091071
8,2023-04-04 09:43:47,'stealthcake20','Why does restaurant miso soup taste superior?','It also helps to quit out of apps that track....,'it also helps quit apps track. including redd...,-0.133333
9,2023-04-04 09:44:09,'aquadragon19','Is it worth it to make your own pasta?','I think it depends. I live with my bf and alw...,'i think depends. live bf always make homemade...,0.142273


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [11]:
import pandas as pd

# assume that your recipe data is loaded into a pandas dataframe called `recipe_df`
df.to_csv('recipe_sentiment.csv', index=False)


In [12]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
# def preprocess(text):
#     new_text = []
 
 
#     for t in text.split(" "):
#         t = '@user' if t.startswith('@') and len(t) > 1 else t
#         t = 'http' if t.startswith('http') else t
#         new_text.append(t)
#     return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
df['positive'] = 0.0
df['neutral'] = 0.0
df['negative'] = 0.0

for ind in df.index:
    print(df['DiscussionBody'][ind])
    text = df['DiscussionBody'][ind]
# text = "I dont like cars, they are accident prone"
    # text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512, add_special_tokens = True)
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)


    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
      l = labels[ranking[i]]
      s = scores[ranking[i]]
      print(f"{i+1}) {l} {np.round(float(s), 4)}")
      df[l][ind] = np.round(float(s), 4)

    # if ind > 10:
    #   break


# df.head(10)
df_result = df.groupby(by = 'DiscussionTitle', as_index=False).positive.agg('mean')
df_result
df_result_final = df_result.sort_values(by='positive', ascending=False)
df_result_final.head(10)





Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

'It changes the texture to be more meaty'
1) neutral 0.865
2) negative 0.0741
3) positive 0.0608
'Oh I’m definitely trying this! Thanks!'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[l][ind] = np.round(float(s), 4)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
'He says he did but the picture says otherwise.'
1) neutral 0.6211
2) negative 0.339
3) positive 0.0399
'Welcome to /r/ketorecipes! Please be sure to include a detailed recipe in your post (this means **quantities** **full instructions** and **in plain text**) or in the comments not *only* a link to the recipe or it will be removed per the sub rules!* For details you can find our [community rules here](https://www.reddit.com/r/ketorecipes/wiki/index) and the [Keto FAQs here](https://www.reddit.com/r/keto/wiki/faq). Please report any rule-violations to the moderators and keep doing the lards work!    *I am a bot and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/ketorecipes) if you have any questions or concerns.*'
1) neutral 0.6053
2) negative 0.2455
3) positive 0.1492
'I wouldn’t waste it. Rancid fat is perfectly safe. It’s just not as pleasant.'
1) nega

Unnamed: 0,DiscussionTitle,positive
150,'Ciceri e Tria Vegan Pugliese Pasta with Chick...,0.9911
704,'What is the best Lasagna recipe?',0.9888
361,'I made Copy Me That to copy and manage recipe...,0.9882
794,'request: Keto sloppy joes recipe? even just a...,0.9879
602,'Stress Busting Mocha Shake - perfect for bulk...,0.9878
165,'Creamy Lemon-Arugula Pasta',0.9866
450,'Low calorie Wrap🌯',0.9853
376,'If you have any interest in Golden Milk its m...,0.9829
673,'Vegan Puerto Rican Rice (Arroz con Gandules)',0.9825
338,'How to cook creamy Pasta e Fagioli bu Vito Ch...,0.9802
