### Import Data

In [1]:
import pandas as pd
df = pd.read_csv('./data/ArgKP21+predictions(v3).csv')
print(df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points', 'predict_kps'], dtype='object')


### Predict via LLama2

In [None]:
import yaml
with open("../conf/index.yaml") as f:
    credentials = yaml.safe_load(f)
Llama2_api_token = credentials['environment_variables']['LLAMA2_API_TOKEN']

In [4]:
import ast
import re
def generating(topic, argument):
    import replicate
    replicate = replicate.Client(api_token=Llama2_api_token)
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "debug": False,
            "top_p": 1,
            "prompt": argument,
            "temperature": 0.5,
            "system_prompt": f"""
        You need to do key point analysis on a set of arguments from user. They are all about the topic "{topic}" You need to generate 5 sentences to summarize all arguments and return them.
        """,
            "max_new_tokens": 500,
            "min_new_tokens": -1,
            "prompt_template": "[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]",
            "repetition_penalty": 1.15
        }
    )
    print(output)
    # The meta/llama-2-70b-chat model can stream output as it's running.
    # The predict method returns an iterator, and you can iterate over that output.
    for item in output:
        print(item, end="")
    extracted_sentences = []
    try:
        if not any(['\n\n1' in output, '\n\n*' in output]):
            print("Neither '\n\n1' nor '\n\n*' found in output. Regenerating string...")
            return generating(topic, argument)
        else:
            if '\n\n1' in output:
                indices = [
                (output.index('\n\n1'), output.index('\n2')),
                (output.index('\n2'), output.index('\n3')),
                (output.index('\n3'), output.index('\n4')),
                (output.index('\n4'), output.index('\n5')),
                (output.index('\n5'), len(output))
                ]
                # Extract the sentences using the indices
                for start, end in indices:
                    sentence = ''.join(output[start+2:end])
                    extracted_sentences.append(sentence.strip())
            elif '\n\n*' in output:
                    # Join the list into a single string
                joined_text = ' '.join(output)
                first_delimiter_index = joined_text.find('\n*')
                right_part = joined_text[first_delimiter_index:]
                extracted_sentences = [sentence.strip() for sentence in right_part.split('\n*')[1:] if sentence.strip()]
            # else:
            #     output = ' '.join(output)
            #     # Find the index of the end of the introductory phrase
            #     intro_end_index = output.find(":") + 2
            #     # Extract the main text after the introductory phrase
            #     main_text = output[intro_end_index:]
            #     # Use regular expression to split the main text into sentences
            #     sentences = re.split(r'(?<=[.!?])\s+', main_text)
            #     extracted_sentences = [sentence.strip() for sentence in sentences]
            # Check if the list can transform in str of standard list
            str_final_list = str(extracted_sentences)
            list_test = ast.literal_eval(str_final_list)
            return list_test
    except (SyntaxError, ValueError) as e:
        print(f"Error: {e}. Regenerating string...")
        return generating(topic, argument)


In [5]:
for index, row in df.iloc[50:].iterrows():
    print(str(index)+": ")
    topic = row['topic']
    argument = row['arguments']
    result = generating(topic, argument)
    print("\n")
    print(len(result))
    str_result = str(result)
    df.at[index, 'predict_kps'] = str_result

# df.to_csv('./data/ArgKP21+predictions(v3).csv', index=False)
# print("Add new data successfully!!!!!")



50: 
[' Sure', ',', ' here', ' are', ' ', '5', ' sentences', ' summar', 'izing', ' the', ' arguments', ' for', ' subs', 'id', 'izing', ' journal', 'ism', ':', '\n', '\n', 'J', 'ournal', 'ism', ' plays', ' a', ' vital', ' role', ' in', ' dem', 'ocracy', ' by', ' keeping', ' the', ' public', ' informed', ' and', ' holding', ' those', ' in', ' power', ' account', 'able', ',', ' and', ' subs', 'id', 'izing', ' it', ' would', ' ensure', ' that', ' this', ' important', ' work', ' continues', '.', ' Sub', 's', 'id', 'izing', ' journal', 'ism', ' would', ' allow', ' for', ' more', ' in', '-', 'depth', ' reporting', ' and', ' a', ' wider', ' range', ' of', ' view', 'points', ',', ' as', ' well', ' as', ' help', ' to', ' combat', ' the', ' influence', ' of', ' fake', ' news', ' and', ' bi', 'ased', ' sources', '.', ' Additionally', ',', ' subs', 'id', 'ies', ' could', ' help', ' to', ' support', ' local', ' journal', 'ism', ' and', ' ensure', ' that', ' communities', ' have', ' access', ' to', '

KeyboardInterrupt: 

In [19]:
df.to_csv('./data/ArgKP21+predictions(v3).csv', index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


### Check chinese Characters

In [1]:
import pandas as pd
df = pd.read_csv('./data/ArgKP21+predictions(v3).csv')
print(df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points', 'predict_kps'], dtype='object')


In [2]:
import re
def contains_chinese(text):
    # Regular expression pattern for Chinese characters
    chinese_pattern = re.compile('[\u4e00-\u9fff]+')

    # Search for Chinese characters in the text
    if chinese_pattern.search(text):
        return True
    else:
        return False

In [3]:
import ast
for index, row in df.iterrows():
    print(str(index)+": ")
    kps = ast.literal_eval(df.at[index,'predict_kps'])
    for kp in kps:
        if contains_chinese(kp): print("Yes")
        # else: print("No")

0: 
1: 
2: 
3: 
4: 
5: 
6: 
7: 
8: 
9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
