## Notebook to process the results for finetuning of open LLMs

### Idea: use the answers from the OpenAI API to fine-tune an open model, which could then give even better results and be used offline

In [1]:
#import openai
from pandas import DataFrame, concat, read_csv, read_parquet,merge, ExcelWriter, read_excel
import requests
#from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.vectorstores import Chroma
#from langchain.embeddings import OpenAIEmbeddings
#from langchain.docstore.document import Document
#from azure.data.tables import TableServiceClient, TableEntity
#from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os
from io import BytesIO
from datetime import date
from tqdm import tqdm
from numpy import array, array_split, float32, set_printoptions
from multiprocessing import  Pool
import tiktoken
import re
from itertools import islice
import json
import ast



In [2]:
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')

In [3]:
#get data from azure blob storage
def get_data(file_name):
    """Download a parquet blob from Azure Blob Storage and load it as a DataFrame.

    Args:
        file_name: Name of the blob inside the ``outlookcontent`` container.

    Returns:
        The DataFrame parsed from the blob on success. On any failure the
        string ``"error downloading data from blob storage"`` is returned
        instead, so callers should check the result before using DataFrame
        attributes such as ``.shape``.
    """
    try:
        # Imported locally: the notebook-level azure import is commented out,
        # which otherwise leaves the client class undefined for every call.
        from azure.storage.blob import BlobServiceClient

        # Create the service client from the connection string in the environment.
        blob_service_client = BlobServiceClient.from_connection_string(OUTLOOK_CONTENT_CONNECTION_STRING)

        container_name = 'outlookcontent'

        # Client bound to the requested blob.
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)

        # Read the whole blob into memory and parse it as parquet.
        blob = blob_client.download_blob()
        df = read_parquet(BytesIO(blob.readall()))
    except Exception as exc:
        # Only ordinary errors are handled (KeyboardInterrupt/SystemExit still
        # propagate) and the real cause is shown instead of being hidden.
        print(f"error downloading data from blob storage: {exc!r}")
        return "error downloading data from blob storage"
    else:
        return df

In [7]:
df1 = get_data('2023-07-12_outlook_summary_1000a.parquet')
df1.shape

In [8]:
df2 = get_data('2023-07-12_outlook_summary_1000b.parquet')
df2.shape

(997, 16)

In [37]:
df = concat([df1, df2])
df.shape

(2897, 16)

In [38]:
df.sort_index(inplace=True)

In [39]:
df.reset_index(inplace=True)
df.rename(columns={'index':'index_old'}, inplace=True)



In [None]:
df.head()

In [41]:
#analyze df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2897 entries, 0 to 2896
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   index_old                2897 non-null   int64              
 1   PartitionKey             2897 non-null   object             
 2   RowKey                   2897 non-null   object             
 3   subject                  2897 non-null   object             
 4   content                  2897 non-null   object             
 5   sender                   2897 non-null   object             
 6   recipients               2897 non-null   object             
 7   received_datetime        2897 non-null   datetime64[ns, UTC]
 8   conversation_id          2897 non-null   object             
 9   web_link                 2897 non-null   object             
 10  content_cleaned          2897 non-null   object             
 11  content_length           2897 

In [42]:
df_error = df[df['finish_reason'] == 'Error']
df_error.shape

(0, 17)

In [43]:
df['finish_reason_summary'] = df['content_summary'].apply(lambda x: x["choices"][0]["finish_reason"])
df_summmary_error = df[df['finish_reason_summary'] == 'Error']
df_summmary_error.shape

(543, 18)

In [44]:
# Keep only the rows whose summary request did not end in an error.
# An explicit copy makes the result an independent frame, so the column
# assignments in later cells do not write to a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['finish_reason_summary'] != 'Error'].copy()
df.shape

(2354, 18)

In [45]:
#content_summary to content to string
df.loc[:, 'summary'] = df['content_summary'].apply(lambda x: x["choices"][0]["message"]["content"])


In [48]:
df.reset_index(inplace=True, drop=True)

In [56]:
# Replace the placeholder names found in the summaries with "Unbekannt".
# Combined pairs come first: if the single names were replaced before them,
# the pair patterns could never match and would leave half of the pair behind.
placeholder_names = (
    "Max Mustermann, Erika Mustermann",
    "Max Mustermann, Erika Musterfrau",
    "Max Mustermann",
    "Erika Mustermann",
)
for placeholder in placeholder_names:
    # regex=False: the placeholders are literal text, not patterns.
    df['summary'] = df['summary'].str.replace(placeholder, "Unbekannt", regex=False)

In [57]:
df['summary'].str.contains('Mustermann').sum()

1

In [63]:
df_out = df[['sender', 'content_cleaned', 'content_string', 'summary']]

In [64]:
with ExcelWriter('../sample_files/content_summary.xlsx') as writer:
    df_out.to_excel(writer, sheet_name='content_summary', index=False)

In [25]:
df_in = read_excel('../sample_files/content_summary.xlsx', sheet_name='content_summary')

In [None]:
df_in.head()

In [26]:
df_in.rename(columns={'content_cleaned':'input', 'content_string': 'output'}, inplace=True)

In [27]:
df_in['instruction'] = """Analysiere folgende Email-Unterhaltung, nach folgenden Kriterien: Sender, Gesendet, Betreff, Nachricht (nur Text, entferne Signaturen, Adressen, Bilder, Links, Disclaimer und Fussnoten), Typ (Frage, Antwort,Information, Aufforderung, Werbung...). Antwort in einer Liste. Einträge getrennt durch <br>. Format:
<br>
Typ: 
Sender: 
Gesendet: 
Subject:
Nachricht:
<br>
"""

In [28]:
#export columns instruction, input, output to jsonl file
df_in[['instruction', 'input', 'output']].to_json('../sample_files/content_proccessed.jsonl', orient='records', lines=True, force_ascii=False)


In [9]:
#drop instruction column
df_in.drop(columns=['instruction'], inplace=True)

In [21]:
df_in['instruction'] = """Erstelle eine Zusammenfassung der folgenden Email-Unterhaltung, inklusive der Personen, die daran beteiligt sind.
                Beispiel:
                Personen: 
                Zusammenfassung: In dieser Email-Unterhaltung geht es um..
                 """

In [22]:
df_in.rename(columns={'content_string':'input', 'summary': 'output'}, inplace=True)

In [24]:
#export columns instruction, input, output to jsonl file
df_in[['instruction', 'input', 'output']].to_json('../sample_files/content_summary.jsonl', orient='records', lines=True,force_ascii=False)

### Add a dataset with instructions from Hugging Face

In [3]:
import datasets
data = datasets.load_dataset('snipaid/instruct-snippet-mlsum-v2')

Downloading readme: 100%|██████████| 1.63k/1.63k [00:00<00:00, 18.0MB/s]


Downloading and preparing dataset csv/snipaid--instruct-snippet-mlsum-v2 to /home/bender/.cache/huggingface/datasets/snipaid___csv/snipaid--instruct-snippet-mlsum-v2-b5b17d046b68c730/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data: 100%|██████████| 10.8M/10.8M [00:00<00:00, 12.7MB/s]
Downloading data files: 100%|██████████| 1/1 [00:01<00:00,  1.93s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 2344.50it/s]
                                                        

Dataset csv downloaded and prepared to /home/bender/.cache/huggingface/datasets/snipaid___csv/snipaid--instruct-snippet-mlsum-v2-b5b17d046b68c730/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 405.44it/s]


In [3]:
train_data = read_parquet('../sample_files/csv-train.parquet')


In [4]:
train_data.head()

Unnamed: 0,label,instruction,input,output
0,title,Welcher Titel würde den Kern des Artikels am b...,Die Cadillac des Jahrgangs 2005 haben nicht me...,Fahrbericht: Cadillac STS 4.6 V8 - Habt Acht!
1,title,Gib diesem Text eine ansprechende Überschrift.,Motorisierte Taxis gibt es schon fast so lange...,London Taxis - Black Cabs auf Tour
2,title,Welche Schlagzeile würde die Aufmerksamkeit de...,"GS, das war einmal: Hightech kompakt verpackt ...",Citroën C5 - Zeit-Maschine
3,title,Finde eine Überschrift.,Zoe ist ein altgriechisches Wort und bedeutet ...,Elektroauto Renault Zoe
4,title,Finde eine passende Überschrift für den folgen...,"""Zuletzt gesichtet vor Cape Flattery."" So lako...",Gefährliche Wasserstraßen - Hochspannung am Pu...


In [5]:
#train_data to jsonl file
train_data[['instruction', 'input', 'output']].to_json('../sample_files/instruct_snippet_mlsunV2.jsonl', orient='records', lines=True, force_ascii=False)

In [None]:
#function to upload data to azure blob storage
def upload_data(df):
    """Upload a DataFrame as a dated parquet blob to Azure Blob Storage.

    The frame is serialised with pyarrow into an in-memory buffer and written
    to the ``outlookcontent`` container as ``<YYYY-MM-DD>_outlook_data.parquet``
    (today's date). An existing blob with the same name is overwritten.

    Args:
        df: DataFrame to upload.

    Returns:
        The value returned by ``upload_blob`` on success. On failure an error
        message is printed and ``None`` is returned.
    """
    try:
        # Imported locally: the notebook-level azure import is commented out,
        # which otherwise leaves the client class undefined for every call.
        from azure.storage.blob import BlobServiceClient

        # Create the service client from the connection string in the environment.
        blob_service_client = BlobServiceClient.from_connection_string(OUTLOOK_CONTENT_CONNECTION_STRING)

        container_name = 'outlookcontent'
        # The blob name is prefixed with today's date.
        file_name = f"{date.today():%Y-%m-%d}_outlook_data.parquet"
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)

        parquet_file = BytesIO()
        df.to_parquet(parquet_file, engine='pyarrow')
        parquet_file.seek(0)  # rewind so the upload reads from the start of the buffer
        response = blob_client.upload_blob(data=parquet_file, overwrite=True)
    except Exception as exc:
        # Only ordinary errors are handled (KeyboardInterrupt/SystemExit still
        # propagate) and the real cause is printed with the message.
        print(f"error uploading data to blob storage: {exc!r}")
    else:
        return response


In [None]:
upload_data(df_normal)

In [None]:
set_printoptions(linewidth=100000)

In [None]:
df_normal = df._to_pandas()

In [None]:
df_normal.to_excel("outlook1_data.xlsx")

In [None]:
l = df_normal.content_processed.to_list()

In [None]:
df.iloc[140:150]

In [None]:
df_normal.to_csv("outlook1_data2.csv", index=False)

In [None]:
df.iloc[140:150].to_csv("test.csv",sep=';', encoding='utf-8', quotechar='"', index=False)

In [None]:
df.iloc[140]["content"]

In [None]:
df.iloc[142]["content_cleaned"]

In [None]:
df.iloc[11]["content_cleaned"]

In [None]:
# Parse the message text of the first row's first choice as JSON.
# json.loads is a module function that takes the string as its argument;
# a str has no `.json` attribute, so the attribute-chain form always raised.
json.loads(df1.iloc[0]["content_processed"]["choices"][0]["message"]["content"])

In [None]:
# German instruction prompt: asks for an analysis of an email conversation
# by sender, sent date, subject, message text and type, answered as a list
# of JSON objects sorted by sent date. It has no placeholders, so it is a
# plain (non-f) string literal.
prompt = """
Analysiere folgende Email-Unterhaltung, getrennt durch dreifache Anführungsstrich, nach folgenden Kriterien:
- Sender
- Gesendet
- Betreff
- Nachricht (nur Text, keine Signaturen oder Fussnoten)
- Typ (Frage, Antwort, Information, Aufforderung, Werbung...)

Antwort als JSON-Objekte in einer Liste. Liste sortiert nach Datum Gesendet, älteste zuerst. JSON-Objekte mit den Kriterien als Keys und den entsprechenden Werten.

"""

In [None]:
num_tokens_from_string(prompt, "cl100k_base")

In [None]:
from IPython.display import display, HTML

def pretty_print(text):
    """Render ``text`` as HTML in the notebook output.

    Each literal two-character sequence "backslash followed by r" in the
    text is replaced by an HTML line break before rendering.

    Args:
        text: String to show.

    Returns:
        Whatever the ``display`` call returns.
    """
    html_markup = text.replace("\\r", "<br>")
    rendered = HTML(html_markup)
    return display(rendered)

In [None]:
pretty_print(df.content_cleaned[11])

In [None]:
df.content_cleaned[1305]

In [None]:
import re

def split_string_by_email(text):
    """Split ``text`` at every ``Von:`` sender header.

    A separator starts at ``Von:`` and extends lazily up to and including
    the next ``<`` on the same line. The separators themselves are dropped.

    Args:
        text: String to split.

    Returns:
        List of the pieces between separators; a one-element list holding
        ``text`` when no separator is found.
    """
    sender_header = re.compile(r'Von:.*?<')
    return sender_header.split(text)

example_string = df.iloc[11]["content_cleaned"]  # Your string here

split_result = split_string_by_email(example_string)

for i, part in enumerate(split_result):
    print(f"Part {i}:")
    print(part)
    print()

In [None]:
df_download.shape

In [None]:
#clean json string to be able to convert to json
df_download["content_processed"] = df_download["content_processed"].apply(lambda x: x.replace("\'", '"'))

In [None]:
#convert string to json
df1["content_processed_content"] = df1["content_processed"].apply(lambda x: x["choices"])

In [None]:
import ast
ast.literal_eval(df1.iloc[1]["content_processed_content"]["choices"])

In [None]:
df1.iloc[1]["content_processed_content"]

In [None]:
prompt = f"""How is the regex for multiple new lines?"""

In [None]:
messages = [{"role": "user", "content": prompt}]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
    temperature=0, # this is the degree of randomness of the model's output
    max_tokens=4000, # this is the maximum number of tokens that the model will generate
    n=1, # this is the number of samples to return
)

In [None]:
response

In [None]:
df_temp = load_data()

In [None]:
df_temp.to_csv("outlook1_data3.csv", index=False)

In [None]:
df_save = df.copy()

In [None]:
df_save2 = df.copy()


In [None]:
#replace empty lists in content_processed with empty dict
df_save["content_processed"] = df_save["content_processed"].apply(lambda x: {} if x == [] else x)

In [None]:
df_save.to_parquet("outlook1_data.parquet", engine='pyarrow')