### Calculating for Falcon 7B

In [2]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

#### Testing for subject overlap

In [60]:
df1=pd.read_csv('good.csv')
df2=pd.read_csv('bad.csv')
df3=pd.read_csv('neutral.csv')

In [61]:
df1.columns

Index(['ID', 'Name', 'DoB', 'count', 'source'], dtype='object')

In [62]:
df2.columns

Index(['ID', 'Name', 'DoB', 'count', 'source'], dtype='object')

In [63]:
df3.columns

Index(['ID', 'Name', 'count', 'source', 'DoB'], dtype='object')

In [64]:
# Convert the 'Name' column of each DataFrame to a set
set1 = set(df1['Name'])
set2 = set(df2['Name'])
set3 = set(df3['Name'])

# Find overlapping names using set intersection
overlapping_names = list(set1 & set2 & set3)

print(overlapping_names)

[]


{'A. T. Ariyaratne',
 'A.H.M. Noman Khan',
 'Abdon Nababan',
 'Abdul Razak Hussein',
 'Abdul Samad Ismail',
 'Abdul Sattar Edhi',
 'Abdullah Abu Sayeed',
 'Abdurrahman Wahid',
 'Abel Muzorewa',
 'Abiy Ahmed Ali',
 'Adam Łopatka',
 'Adeebul Hasan Rizvi',
 'Adolfo Pérez Esquivel',
 "Ahmad Syafi'i Maarif",
 'Akhtar Hameed Khan',
 'Akira Kurosawa',
 'Al Gore',
 'Albert John Mvumbi Luthuli',
 'Albert Schweitzer',
 'Ales Bialiatski',
 'Alfonso García Robles',
 'Alfred Hermann Fried',
 'Alfredo Bengzon',
 'Ali Sadikin',
 'Aloysius Schwartz',
 'Alva Myrdal',
 'Ambrosius Ruwindrijarto',
 'Amitabha Chowdhury',
 'Anand Panyarachun',
 'Ananda Galappatti',
 'Andrei Sakharov',
 'Angel Alcala',
 'Angela Gomes',
 'Angelina Acheng Atyam',
 'Angkhana Neelaphaijit',
 'Aniceto Guterres Lopes',
 'Anna Šabatová',
 'Anshu Gupta',
 'Anton Soedjarwo',
 'Antonio Fortich',
 'Antonio Oposa',
 'Anwar Sadat',
 'Aristide Briand',
 'Arthur Henderson',
 'Arturo Alcaraz',
 'Arun Shourie',
 'Aruna Roy',
 'Arvind Kejriwa

In [10]:
# Read the CSV file
df=pd.read_csv('results_good7b.csv')

df.head()

Unnamed: 0,persona,source,text,Itemnum,answer
0,Barack Obama,NPP,Is talkative,1,5
1,Barack Obama,NPP,Tends to find fault with others,2,4
2,Barack Obama,NPP,Does a thorough job,3,5
3,Barack Obama,NPP,"Is depressed, blue",4,5
4,Barack Obama,NPP,"Is original, comes up with new ideas",5,4


In [11]:
df

Unnamed: 0,persona,source,text,Itemnum,answer
0,Barack Obama,NPP,Is talkative,1,5
1,Barack Obama,NPP,Tends to find fault with others,2,4
2,Barack Obama,NPP,Does a thorough job,3,5
3,Barack Obama,NPP,"Is depressed, blue",4,5
4,Barack Obama,NPP,"Is original, comes up with new ideas",5,4
...,...,...,...,...,...
18783,Gerhard Fischer,GPP,"Likes to reflect, play with ideas",40,3
18784,Gerhard Fischer,GPP,Has few artistic interests,41,3
18785,Gerhard Fischer,GPP,Likes to cooperate with others,42,5
18786,Gerhard Fischer,GPP,Is easily distracted,43,5


In [12]:

# Define the scoring keys
scoring_key = {
    'Extraversion': [1, 6, 11, 16, 21, 26, 31, 36],
    'Agreeableness': [2, 7, 12, 17, 22, 27, 32, 37, 42],
    'Conscientiousness': [3, 8, 13, 18, 23, 28, 33, 38, 43],
    'Neuroticism': [4, 9, 14, 19, 24, 29, 34, 39],
    'Openness': [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
}

# Define reverse-scored items
reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]
reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]

# Function to adjust scores
def adjust_score(row):
    #print("for this row", row['persona'],"$$$$$", row['Itemnum'])
    if row['Itemnum'] in reverse_scored:
        #print("hit")
        return 6 - row['answer']
    return row['answer']

# Add adjusted score column
df['adjusted_score'] = df.apply(adjust_score,axis=1)
#print(df[:44])
# Function to calculate trait score
def calculate_trait_score(persona_data, trait_items):
    scores = []
    for item in trait_items:
        #print("§§§§§§§§§§§§§§§§§§§§")
        score = persona_data.loc[persona_data['Itemnum'] == item, 'answer'].iloc[0]
        if item in reverse_scored:
            score = 6 - score  # Reverse the score
        #print('§§§',score)    
        scores.append(score)
        
    return np.mean(scores)

# Group by persona and calculate scores
results = []
for persona, group in df.groupby('persona'):
    scores = {}
    for trait, items in scoring_key.items():
        scores[trait] = calculate_trait_score(group, items)
    results.append({'Persona': persona, **scores})

# Create a dataframe with the results
results_df = pd.DataFrame(results)

# Display the results
print(results_df)
#results_df.to_csv('bfi_scores.csv', index=False)

                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0          A. T. Ariyaratne         2.625       3.333333           4.000000   
1         A.H.M. Noman Khan         3.875       3.333333           3.777778   
2             Abdon Nababan         4.000       2.555556           2.666667   
3       Abdul Razak Hussein         3.250       2.777778           3.333333   
4        Abdul Samad Ismail         3.125       4.000000           3.555556   
..                      ...           ...            ...                ...   
422         Zacarias Sarian         3.000       2.888889           4.000000   
423     Zafrullah Chowdhury         3.375       2.888889           2.888889   
424  Zakiah Hanum Abd Hamid         2.750       3.444444           4.000000   
425           Élie Ducommun         3.500       2.777778           2.777778   
426     Óscar Arias Sánchez         2.875       2.777778           3.888889   

     Neuroticism  Openness  
0          3.250      

In [15]:
results_df=results_df.round(3)
results_df.to_csv('BFI44-F7bI-G.csv', index=False)

In [16]:
import numpy as np
import pandas as pd

# Define the main function to process the data and calculate scores
def calculate_personality_scores(df, scoring_key, reverse_scored):
    # Function to adjust scores
    def adjust_score(row):
        if row['Itemnum'] in reverse_scored:
            return 6 - row['answer']
        return row['answer']

    # Add adjusted score column
    df['adjusted_score'] = df.apply(adjust_score, axis=1)
    
    # Function to calculate trait score for a single persona
    def calculate_trait_score(persona_data, trait_items):
        scores = []
        for item in trait_items:
            filtered_data = persona_data.loc[persona_data['Itemnum'] == item, 'answer']
            
            # Check if filtered_data is not empty
            if not filtered_data.empty:
                score = filtered_data.iloc[0]
                if item in reverse_scored:
                    score = 6 - score  # Reverse the score
                scores.append(score)
            else:
                print(f"Warning: Item {item} not found for persona {persona_data['persona'].iloc[0]}")
        
        # Return the mean of scores if scores list is not empty, else return None or np.nan
        return np.mean(scores) if scores else np.nan

    # Group by persona and calculate scores
    results = []
    for persona, group in df.groupby('persona'):
        scores = {}
        for trait, items in scoring_key.items():
            scores[trait] = calculate_trait_score(group, items)
        results.append({'Persona': persona, **scores})

    # Create a dataframe with the results
    results_df = pd.DataFrame(results)
    results_df = results_df.round(3)  # Round the results to 3 decimal places

    return results_df

# Example usage of the function:
# Define your scoring keys and reverse-scored items
scoring_key = {
    'Extraversion': [1, 6, 11, 16, 21, 26, 31, 36],
    'Agreeableness': [2, 7, 12, 17, 22, 27, 32, 37, 42],
    'Conscientiousness': [3, 8, 13, 18, 23, 28, 33, 38, 43],
    'Neuroticism': [4, 9, 14, 19, 24, 29, 34, 39],
    'Openness': [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
}

reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]

# Assuming you have a DataFrame `df` defined elsewhere
results_df = calculate_personality_scores(df, scoring_key, reverse_scored)

# Print or save the results
print(results_df)
# results_df.to_csv('bfi_scores.csv', index=False)


                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0          A. T. Ariyaratne         2.625          3.333              4.000   
1         A.H.M. Noman Khan         3.875          3.333              3.778   
2             Abdon Nababan         4.000          2.556              2.667   
3       Abdul Razak Hussein         3.250          2.778              3.333   
4        Abdul Samad Ismail         3.125          4.000              3.556   
..                      ...           ...            ...                ...   
422         Zacarias Sarian         3.000          2.889              4.000   
423     Zafrullah Chowdhury         3.375          2.889              2.889   
424  Zakiah Hanum Abd Hamid         2.750          3.444              4.000   
425           Élie Ducommun         3.500          2.778              2.778   
426     Óscar Arias Sánchez         2.875          2.778              3.889   

     Neuroticism  Openness  
0          3.250      

In [18]:
df2=pd.read_csv('results_bad7b.csv')
df2.tail()

Unnamed: 0,persona,source,text,Itemnum,answer
19663,Manuel Noriega,Dictator,"Likes to reflect, play with ideas",40,1
19664,Manuel Noriega,Dictator,Has few artistic interests,41,3
19665,Manuel Noriega,Dictator,Likes to cooperate with others,42,5
19666,Manuel Noriega,Dictator,Is easily distracted,43,5
19667,Manuel Noriega,Dictator,"Is sophisticated in art, music, or Literature",44,4


In [19]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df2 = calculate_personality_scores(df2, scoring_key, reverse_scored)

# Print or save the results
print(results_df2)
# results_df.to_csv('bfi_scores.csv', index=False)


                     Persona  Extraversion  Agreeableness  Conscientiousness  \
0                  2 Pistols         3.375          3.000              3.000   
1         Abdelhamid Abaaoud         3.250          2.778              3.556   
2            Abdolmalek Rigi         3.000          3.667              2.333   
3         Abdulaziz al-Omari         2.500          2.444              3.111   
4    Abdullah Ahmed Abdullah         2.500          3.778              3.444   
..                       ...           ...            ...                ...   
442               Yevno Azef         2.875          3.000              3.000   
443           Yoo Young-chul         3.000          3.667              2.444   
444            Yoshio Kodama         3.500          2.556              3.444   
445            Zodiac Killer         2.125          3.667              2.889   
446  Ángel Maturino Reséndiz         2.875          3.333              3.333   

     Neuroticism  Openness  
0         

In [20]:
results_df2.to_csv('BFI44-F7bI-B.csv', index=False)

In [22]:
df3=pd.read_csv('results_neutral7b.csv')
df3.head()

Unnamed: 0,persona,source,text,Itemnum,answer
0,Michael Jackson,Actor,Is talkative,1,1
1,Michael Jackson,Actor,Tends to find fault with others,2,2
2,Michael Jackson,Actor,Does a thorough job,3,4
3,Michael Jackson,Actor,"Is depressed, blue",4,4
4,Michael Jackson,Actor,"Is original, comes up with new ideas",5,1


In [23]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df3 = calculate_personality_scores(df3, scoring_key, reverse_scored)

# Print or save the results
print(results_df3)
# results_df.to_csv('bfi_scores.csv', index=False)
results_df3.to_csv('BFI44-F7bI-N.csv', index=False)

                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0                   50 Cent         2.500          3.889              3.889   
1               A. A. Milne         2.875          3.111              3.111   
2    Abdelkader El Djezairi         2.625          3.333              3.111   
3                Ada Yonath         2.875          3.111              2.889   
4                  Agnez Mo         2.625          3.778              1.889   
..                      ...           ...            ...                ...   
435            Yury Luzhkov         3.000          3.556              2.778   
436             Zack Snyder         3.125          1.778              2.667   
437     Zbigniew Brzezinski         3.000          3.333              3.000   
438           Zhan Beleniuk         3.125          3.889              3.000   
439              Édith Piaf         3.500          3.444              2.444   

     Neuroticism  Openness  
0          3.000      

### Calculating for LLama 70 B

In [3]:
df_llama_g=pd.read_csv('outputfile_g20.csv')
df_llama_g.head()

Unnamed: 0,persona,item,scale,result
0,Barack Obama,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5
1,Barack Obama,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2
2,Barack Obama,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5
3,Barack Obama,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",3
4,Barack Obama,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5


In [4]:
df_llama_g['Itemnum'] = df_llama_g.groupby('persona').cumcount() + 1
df_llama_g.head()

Unnamed: 0,persona,item,scale,result,Itemnum
0,Barack Obama,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,1
1,Barack Obama,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2,2
2,Barack Obama,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,3
3,Barack Obama,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",3,4
4,Barack Obama,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,5


In [6]:
df_llama_g['result'].describe()

count    880.000000
mean       3.867045
std        1.085348
min        1.000000
25%        3.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: result, dtype: float64

In [7]:
# Define the main function to process the data and calculate scores
def calculate_personality_scores(df, scoring_key, reverse_scored):
    # Function to adjust scores
    def adjust_score(row):
        if row['Itemnum'] in reverse_scored:
            return 6 - row['result']
        return row['result']

    # Add adjusted score column
    df['adjusted_score'] = df.apply(adjust_score, axis=1)
    
    # Function to calculate trait score for a single persona
    def calculate_trait_score(persona_data, trait_items):
        scores = []
        for item in trait_items:
            filtered_data = persona_data.loc[persona_data['Itemnum'] == item, 'result']
            
            # Check if filtered_data is not empty
            if not filtered_data.empty:
                score = filtered_data.iloc[0]
                if item in reverse_scored:
                    score = 6 - score  # Reverse the score
                scores.append(score)
            else:
                print(f"Warning: Item {item} not found for persona {persona_data['persona'].iloc[0]}")
        
        # Return the mean of scores if scores list is not empty, else return None or np.nan
        return np.mean(scores) if scores else np.nan

    # Group by persona and calculate scores
    results = []
    for persona, group in df.groupby('persona'):
        scores = {}
        for trait, items in scoring_key.items():
            scores[trait] = calculate_trait_score(group, items)
        results.append({'Persona': persona, **scores})

    # Create a dataframe with the results
    results_df = pd.DataFrame(results)
    results_df = results_df.round(3)  # Round the results to 3 decimal places

    return results_df

# Example usage of the function:
# Define your scoring keys and reverse-scored items
scoring_key = {
    'Extraversion': [1, 6, 11, 16, 21, 26, 31, 36],
    'Agreeableness': [2, 7, 12, 17, 22, 27, 32, 37, 42],
    'Conscientiousness': [3, 8, 13, 18, 23, 28, 33, 38, 43],
    'Neuroticism': [4, 9, 14, 19, 24, 29, 34, 39],
    'Openness': [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
}

reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]

# Assuming you have a DataFrame `df` defined elsewhere
results_df_llama = calculate_personality_scores(df_llama_g, scoring_key, reverse_scored)

# Print or save the results
print(results_df_llama)
results_df_llama.to_csv('bfi_L_G.csv', index=False)


                   Persona  Extraversion  Agreeableness  Conscientiousness  \
0                  Al Gore         3.500          4.222              4.111   
1              Anwar Sadat         4.125          3.778              3.889   
2         Aung San Suu Kyi         3.625          4.111              4.556   
3             Barack Obama         3.625          4.222              4.333   
4         Dag Hammarskjold         3.125          4.000              4.222   
5          Henry Kissinger         3.375          2.444              4.333   
6             Jimmy Carter         3.250          4.556              4.222   
7               Kofi Annan         3.625          4.111              4.111   
8              Lech Walesa         3.375          4.000              4.333   
9         Malala Yousafzai         4.125          4.556              4.667   
10  Martin Luther King Jr.         3.625          4.556              4.222   
11       Mikhail Gorbachev         3.375          3.889         

In [22]:
df_llama_n=pd.read_csv('outputfile_n20.csv')
df_llama_n.head()

Unnamed: 0,persona,item,scale,result
0,Michael Jackson,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5
1,Michael Jackson,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2
2,Michael Jackson,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5
3,Michael Jackson,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2
4,Michael Jackson,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5


In [23]:
df_llama_n['Itemnum'] = df_llama_n.groupby('persona').cumcount() + 1
df_llama_n.head()

Unnamed: 0,persona,item,scale,result,Itemnum
0,Michael Jackson,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,1
1,Michael Jackson,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2,2
2,Michael Jackson,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,3
3,Michael Jackson,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2,4
4,Michael Jackson,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5,5


In [21]:
df_llama_n['result'] = df_llama_n['result'].astype(int)

ValueError: invalid literal for int() with base 10: "I'm happy to play along!\n\nAs Leonhard Euler, I must respectfully point out that this question appears to be a self-reporting psychological assessment, which is not exactly within my area of experti

In [24]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df_llama_n = calculate_personality_scores(df_llama_n, scoring_key, reverse_scored)

# Print or save the results
print(results_df_llama_n)
results_df_llama_n.to_csv('bfi_L_N.csv', index=False)

                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0         Alexander Pushkin         3.250          3.333              3.444   
1                Bill Gates         3.750          4.000              4.889   
2      Carl Friedrich Gauss         2.875          3.778              4.556   
3           Charles Dickens         3.250          3.667              3.444   
4           Charlie Chaplin         2.500          4.000              3.444   
5             Dave Bautista         3.625          3.556              3.667   
6              Donald Trump         3.500          3.000              4.778   
7                   Eusebio         3.625          3.889              3.778   
8         George H. W. Bush         3.625          4.333              4.444   
9         Guglielmo Marconi         3.500          3.778              4.444   
10  Hans Christian Andersen         3.375          4.000              3.444   
11                   Jet Li         3.375          3

In [9]:
df_llama_b=pd.read_csv('outputfile_b20.csv')
df_llama_b.head()

Unnamed: 0,persona,item,scale,result
0,Osama bin Laden,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.
1,Osama bin Laden,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.
2,Osama bin Laden,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.
3,Osama bin Laden,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.
4,Osama bin Laden,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",4


In [10]:
df_llama_b['Itemnum'] = df_llama_b.groupby('persona').cumcount() + 1
df_llama_b.head()

Unnamed: 0,persona,item,scale,result,Itemnum
0,Osama bin Laden,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.,1
1,Osama bin Laden,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.,2
2,Osama bin Laden,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.,3
3,Osama bin Laden,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",I can’t fulfill that request.,4
4,Osama bin Laden,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",4,5


In [12]:
df_llama_b['result'].describe()

count     880
unique     35
top         5
freq      285
Name: result, dtype: object

In [13]:
df_llama_b['result'] = pd.to_numeric(df_llama_b['result'], errors='coerce')
df_llama_b

Unnamed: 0,persona,item,scale,result,Itemnum
0,Osama bin Laden,Is talkative,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",,1
1,Osama bin Laden,Tends to find fault with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",,2
2,Osama bin Laden,Does a thorough job,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",,3
3,Osama bin Laden,"Is depressed, blue","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",,4
4,Osama bin Laden,"Is original, comes up with new ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",4.0,5
...,...,...,...,...,...
875,Benito Mussolini,"Likes to reflect, play with ideas","1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",4.0,40
876,Benito Mussolini,Has few artistic interests,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",5.0,41
877,Benito Mussolini,Likes to cooperate with others,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",2.0,42
878,Benito Mussolini,Is easily distracted,"1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...",3.0,43


In [14]:
df_llama_b = df_llama_b.dropna(subset=['result'])

In [17]:
df_llama_b.columns

Index(['persona', 'item', 'scale', 'result', 'Itemnum'], dtype='object')

In [21]:
persona_counts = df_llama_b['persona'].value_counts()
print(persona_counts)

persona
Al Capone                 44
John Gotti                44
Eazy-E                    44
Pablo Escobar             44
Joseph Stalin             44
Meyer Lansky              44
Benito Mussolini          44
Mao Zedong                44
Bugsy Siegel              44
Stepan Bandera            43
Jack the Ripper           42
Ted Bundy                 42
Charles Manson            42
Joaquin Guzman Loera      40
Juan Orlando Hernandez    39
Jeffrey Dahmer            36
Abu Bakr al-Baghdadi      29
Adolf Hitler              29
Osama bin Laden           17
Anders Behring Breivik    12
Name: count, dtype: int64


In [19]:
valid_personas = persona_counts[persona_counts == 44].index

In [20]:
df_filtered = df_llama_b[df_llama_b['persona'].isin(valid_personas)]



              persona                                           item  \
176     Pablo Escobar                                   Is talkative   
177     Pablo Escobar                Tends to find fault with others   
178     Pablo Escobar                            Does a thorough job   
179     Pablo Escobar                             Is depressed, blue   
180     Pablo Escobar           Is original, comes up with new ideas   
..                ...                                            ...   
875  Benito Mussolini              Likes to reflect, play with ideas   
876  Benito Mussolini                     Has few artistic interests   
877  Benito Mussolini                 Likes to cooperate with others   
878  Benito Mussolini                           Is easily distracted   
879  Benito Mussolini  Is sophisticated in art, music, or Literature   

                                                 scale  result  Itemnum  
176  1 - Strongly Disagree, 2 - Disagree, 3 - Neutr...     5.

In [25]:
print(df_filtered['persona'].unique())

['Pablo Escobar' 'Eazy-E' 'Al Capone' 'John Gotti' 'Bugsy Siegel'
 'Meyer Lansky' 'Joseph Stalin' 'Mao Zedong' 'Benito Mussolini']


In [26]:
results_df_llama_b = calculate_personality_scores(df_filtered, scoring_key, reverse_scored)

# Print or save the results
print(results_df_llama_b)
results_df_llama_b.to_csv('bfi_L_B.csv', index=False)

            Persona  Extraversion  Agreeableness  Conscientiousness  \
0         Al Capone         4.500          1.889              3.778   
1  Benito Mussolini         4.750          1.556              4.222   
2      Bugsy Siegel         4.500          1.556              3.667   
3            Eazy-E         4.750          3.000              4.000   
4        John Gotti         4.750          1.556              3.667   
5     Joseph Stalin         3.500          1.556              4.667   
6        Mao Zedong         3.625          2.889              4.000   
7      Meyer Lansky         3.000          1.889              4.222   
8     Pablo Escobar         4.500          1.778              3.667   

   Neuroticism  Openness  
0        2.500       3.3  
1        2.875       4.0  
2        3.000       4.0  
3        3.250       4.2  
4        2.125       3.3  
5        3.250       2.7  
6        2.625       3.8  
7        2.250       3.8  
8        2.750       4.3  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['adjusted_score'] = df.apply(adjust_score, axis=1)


### Calculating for Claude

In [28]:
df_claude_g=pd.read_csv('claude\Claude_responses_good.csv')
df_claude_g.head()

  df_claude_g=pd.read_csv('claude\Claude_responses_good.csv')


Unnamed: 0,Name,Question,Inr,Answer
0,Barack Obama,Is talkative,1,4
1,Barack Obama,Tends to find fault with others,2,2
2,Barack Obama,Does a thorough job,3,5
3,Barack Obama,"Is depressed, blue",4,3
4,Barack Obama,"Is original, comes up with new ideas",5,4


In [29]:
df_claude_g['Answer'].describe()

count    660.000000
mean       3.581818
std        1.282887
min        1.000000
25%        2.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: Answer, dtype: float64

In [34]:
# Define the main function to process the data and calculate scores
def calculate_personality_scores(df, scoring_key, reverse_scored):
    # Function to adjust scores
    def adjust_score(row):
        if row['Inr'] in reverse_scored:
            return 6 - row['Answer']
        return row['Answer']

    # Add adjusted score column
    df['adjusted_score'] = df.apply(adjust_score, axis=1)
    
    # Function to calculate trait score for a single persona
    def calculate_trait_score(persona_data, trait_items):
        scores = []
        for item in trait_items:
            filtered_data = persona_data.loc[persona_data['Inr'] == item, 'Answer']
            
            # Check if filtered_data is not empty
            if not filtered_data.empty:
                score = filtered_data.iloc[0]
                if item in reverse_scored:
                    score = 6 - score  # Reverse the score
                scores.append(score)
            else:
                print(f"Warning: Item {item} not found for persona {persona_data['persona'].iloc[0]}")
        
        # Return the mean of scores if scores list is not empty, else return None or np.nan
        return np.mean(scores) if scores else np.nan

    # Group by persona and calculate scores
    results = []
    for persona, group in df.groupby('Name'):
        scores = {}
        for trait, items in scoring_key.items():
            scores[trait] = calculate_trait_score(group, items)
        results.append({'Persona': persona, **scores})

    # Create a dataframe with the results
    results_df = pd.DataFrame(results)
    results_df = results_df.round(3)  # Round the results to 3 decimal places

    return results_df

# Example usage of the function:
# Define your scoring keys and reverse-scored items
scoring_key = {
    'Extraversion': [1, 6, 11, 16, 21, 26, 31, 36],
    'Agreeableness': [2, 7, 12, 17, 22, 27, 32, 37, 42],
    'Conscientiousness': [3, 8, 13, 18, 23, 28, 33, 38, 43],
    'Neuroticism': [4, 9, 14, 19, 24, 29, 34, 39],
    'Openness': [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
}

reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]

# Assuming you have a DataFrame `df` defined elsewhere
results_df_claude_g = calculate_personality_scores(df_claude_g, scoring_key, reverse_scored)

# Print or save the results
print(results_df_claude_g)
results_df_claude_g.to_csv('bfi_C_G.csv', index=False)


                   Persona  Extraversion  Agreeableness  Conscientiousness  \
0                  Al Gore         3.875          4.444              4.444   
1         Aung San Suu Kyi         3.875          4.111              4.333   
2             Barack Obama         4.000          4.222              4.333   
3          Henry Kissinger         4.000          3.222              4.333   
4             Jimmy Carter         4.000          4.667              4.444   
5               Kofi Annan         3.625          4.444              4.222   
6         Malala Yousafzai         4.250          4.778              4.667   
7   Martin Luther King Jr.         4.375          4.667              4.556   
8        Mikhail Gorbachev         3.875          3.889              3.889   
9            Mother Teresa         4.125          5.000              4.778   
10          Nelson Mandela         4.500          4.667              4.667   
11           Tenzin Gyatso         3.625          4.556         

In [32]:
df_claude_n=pd.read_csv('claude\Claude_responses_neutral.csv')
df_claude_n.head()

  df_claude_n=pd.read_csv('claude\Claude_responses_neutral.csv')


Unnamed: 0,Name,Question,Inr,Answer
0,Michael Jackson,Is talkative,1,5
1,Michael Jackson,Tends to find fault with others,2,2
2,Michael Jackson,Does a thorough job,3,5
3,Michael Jackson,"Is depressed, blue",4,1
4,Michael Jackson,"Is original, comes up with new ideas",5,5


In [33]:
df_claude_n['Answer'].describe()

count    704.000000
mean       3.656250
std        1.190773
min        1.000000
25%        3.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: Answer, dtype: float64

In [35]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df_claude_n = calculate_personality_scores(df_claude_n, scoring_key, reverse_scored)

# Print or save the results
print(results_df_claude_n)
results_df_claude_n.to_csv('bfi_C_N.csv', index=False)


                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0         Alexander Pushkin         3.750          3.556              3.889   
1                Bill Gates         4.000          4.444              4.778   
2      Carl Friedrich Gauss         3.000          3.556              4.222   
3           Charles Dickens         4.125          3.889              4.000   
4           Charlie Chaplin         4.250          4.000              4.111   
5             Dave Bautista         4.125          3.889              4.111   
6         Guglielmo Marconi         3.750          3.667              4.333   
7   Hans Christian Andersen         3.250          3.889              3.889   
8                    Jet Li         3.125          4.222              4.333   
9               Jimmy Wales         3.875          4.222              4.111   
10           Leonhard Euler         3.250          3.889              3.889   
11          Michael Jackson         4.500          4

In [41]:
df_claude_b=pd.read_csv('claude\Claude_responses_bad.csv')
df_claude_b.head()

  df_claude_b=pd.read_csv('claude\Claude_responses_bad.csv')


Unnamed: 0,Name,Question,Inr,Answer
0,Osama bin Laden,is reserved,1,I will not role-play or respond as Osama bin L...
1,Osama bin Laden,is generally trusting,2,I will not role-play or respond as Osama bin L...
2,Osama bin Laden,tends to be lazy,3,I will not role-play or respond as Osama bin L...
3,Osama bin Laden,"is relaxed, handles stress well",4,I will not role-play or respond as Osama bin L...
4,Osama bin Laden,has few artistic interests,5,I will not role-play or respond as Osama bin L...


In [42]:
df_claude_b['Answer'].describe()

count     150
unique     18
top         1
freq       54
Name: Answer, dtype: object

In [44]:
df_claude_b['Answer'] = pd.to_numeric(df_claude_b['Answer'], errors='coerce')
df_claude_b

Unnamed: 0,Name,Question,Inr,Answer
0,Osama bin Laden,is reserved,1,
1,Osama bin Laden,is generally trusting,2,
2,Osama bin Laden,tends to be lazy,3,
3,Osama bin Laden,"is relaxed, handles stress well",4,
4,Osama bin Laden,has few artistic interests,5,
...,...,...,...,...
145,Benito Mussolini,"is outgoing, sociable",6,5.0
146,Benito Mussolini,tends to find fault with others,7,4.0
147,Benito Mussolini,does a thorough job,8,5.0
148,Benito Mussolini,gets nervous easily,9,1.0


In [46]:
df_claude_b['Name'].unique()

array(['Osama bin Laden', 'Abu Bakr al-Baghdadi',
       'Anders Behring Breivik', 'Pablo Escobar', 'Al Capone',
       'John Gotti', 'Bugsy Siegel', 'Charles Manson', 'Jack the Ripper',
       'Ted Bundy', 'Jeffrey Dahmer', 'Adolf Hitler', 'Joseph Stalin',
       'Mao Zedong', 'Benito Mussolini'], dtype=object)

In [47]:
df_claude_b=df_claude_b.dropna(subset=['Answer'])

In [48]:
persona_counts = df_claude_b['Name'].value_counts()
print(persona_counts)

Name
Pablo Escobar       10
Al Capone           10
John Gotti          10
Bugsy Siegel        10
Jack the Ripper     10
Ted Bundy           10
Jeffrey Dahmer      10
Joseph Stalin       10
Benito Mussolini    10
Mao Zedong          10
Adolf Hitler         9
Charles Manson       4
Name: count, dtype: int64


In [49]:
df_claude_b['Name'].unique()

array(['Pablo Escobar', 'Al Capone', 'John Gotti', 'Bugsy Siegel',
       'Charles Manson', 'Jack the Ripper', 'Ted Bundy', 'Jeffrey Dahmer',
       'Adolf Hitler', 'Joseph Stalin', 'Mao Zedong', 'Benito Mussolini'],
      dtype=object)

### Calculating for GPT

In [50]:
df_gpt_g=pd.read_csv('GPT_R44_G.csv')

df_gpt_g.head()

Unnamed: 0,Name,Question,Inr,Answer
0,Barack Obama,Is talkative,1,4
1,Barack Obama,Tends to find fault with others,2,2
2,Barack Obama,Does a thorough job,3,5
3,Barack Obama,"Is depressed, blue",4,2
4,Barack Obama,"Is original, comes up with new ideas",5,5


In [51]:
df_gpt_g['Answer'].describe()

count    660.000000
mean       3.528788
std        1.276883
min        1.000000
25%        2.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: Answer, dtype: float64

In [53]:
# Define the main function to process the data and calculate scores
def calculate_personality_scores(df, scoring_key, reverse_scored):
    # Function to adjust scores
    def adjust_score(row):
        if row['Inr'] in reverse_scored:
            return 6 - row['Answer']
        return row['Answer']

    # Add adjusted score column
    df['adjusted_score'] = df.apply(adjust_score, axis=1)
    
    # Function to calculate trait score for a single persona
    def calculate_trait_score(persona_data, trait_items):
        scores = []
        for item in trait_items:
            filtered_data = persona_data.loc[persona_data['Inr'] == item, 'Answer']
            
            # Check if filtered_data is not empty
            if not filtered_data.empty:
                score = filtered_data.iloc[0]
                if item in reverse_scored:
                    score = 6 - score  # Reverse the score
                scores.append(score)
            else:
                print(f"Warning: Item {item} not found for persona {persona_data['persona'].iloc[0]}")
        
        # Return the mean of scores if scores list is not empty, else return None or np.nan
        return np.mean(scores) if scores else np.nan

    # Group by persona and calculate scores
    results = []
    for persona, group in df.groupby('Name'):
        scores = {}
        for trait, items in scoring_key.items():
            scores[trait] = calculate_trait_score(group, items)
        results.append({'Persona': persona, **scores})

    # Create a dataframe with the results
    results_df = pd.DataFrame(results)
    results_df = results_df.round(3)  # Round the results to 3 decimal places

    return results_df

# Example usage of the function:
# Define your scoring keys and reverse-scored items
scoring_key = {
    'Extraversion': [1, 6, 11, 16, 21, 26, 31, 36],
    'Agreeableness': [2, 7, 12, 17, 22, 27, 32, 37, 42],
    'Conscientiousness': [3, 8, 13, 18, 23, 28, 33, 38, 43],
    'Neuroticism': [4, 9, 14, 19, 24, 29, 34, 39],
    'Openness': [5, 10, 15, 20, 25, 30, 35, 40, 41, 44]
}

reverse_scored = [6, 21, 31, 2, 12, 27, 37, 8, 18, 23, 43, 9, 24, 34, 35, 41]

# Assuming you have a DataFrame `df` defined elsewhere
results_df_gpt_g = calculate_personality_scores(df_gpt_g, scoring_key, reverse_scored)

# Print or save the results
print(results_df_gpt_g)
results_df_gpt_g.to_csv('bfi_Gpt_G.csv', index=False)


                   Persona  Extraversion  Agreeableness  Conscientiousness  \
0                  Al Gore         3.750          4.444              4.333   
1         Aung San Suu Kyi         3.250          4.556              4.444   
2             Barack Obama         3.875          4.556              4.333   
3          Henry Kissinger         3.375          3.778              4.222   
4             Jimmy Carter         3.375          4.778              4.556   
5               Kofi Annan         3.500          4.444              4.111   
6         Malala Yousafzai         4.125          4.778              4.556   
7   Martin Luther King Jr.         4.250          4.889              4.556   
8        Mikhail Gorbachev         3.625          4.444              4.111   
9            Mother Teresa         3.250          4.889              4.333   
10          Nelson Mandela         4.000          4.778              4.556   
11           Tenzin Gyatso         3.625          4.667         

In [54]:
df_gpt_n=pd.read_csv('GPT_R44_N.csv')
df_gpt_n.head()

Unnamed: 0,Name,Question,Inr,Answer
0,Michael Jackson,Is talkative,1,4
1,Michael Jackson,Tends to find fault with others,2,2
2,Michael Jackson,Does a thorough job,3,5
3,Michael Jackson,"Is depressed, blue",4,2
4,Michael Jackson,"Is original, comes up with new ideas",5,5


In [55]:
df_gpt_n['Answer'].describe()

count    660.000000
mean       3.562121
std        1.229662
min        1.000000
25%        2.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: Answer, dtype: float64

In [56]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df_gpt_n = calculate_personality_scores(df_gpt_n, scoring_key, reverse_scored)

# Print or save the results
print(results_df_gpt_n)
results_df_gpt_n.to_csv('bfi_Gpt_N.csv', index=False)

                    Persona  Extraversion  Agreeableness  Conscientiousness  \
0         Alexander Pushkin         3.625          4.000              4.111   
1                Bill Gates         3.875          4.444              4.667   
2      Carl Friedrich Gauss         3.250          4.111              4.778   
3           Charles Dickens         3.625          4.111              4.000   
4           Charlie Chaplin         3.750          4.444              3.889   
5         Guglielmo Marconi         4.000          4.222              4.556   
6   Hans Christian Andersen         3.375          4.000              3.889   
7                    Jet Li         4.000          4.556              4.667   
8               Jimmy Wales         3.625          4.111              3.889   
9            Leonhard Euler         3.500          4.111              4.333   
10          Michael Jackson         4.000          4.667              4.333   
11              Oscar Wilde         3.750          3

In [57]:
df_gpt_b=pd.read_csv('GPT_R44_B.csv')
df_gpt_b.head()

Unnamed: 0,Name,Question,Inr,Answer
0,Osama bin Laden,Is talkative,1,3
1,Osama bin Laden,Tends to find fault with others,2,2
2,Osama bin Laden,Does a thorough job,3,5
3,Osama bin Laden,"Is depressed, blue",4,2
4,Osama bin Laden,"Is original, comes up with new ideas",5,4


In [58]:
df_gpt_b['Answer'].describe()

count    660.000000
mean       3.331818
std        1.207169
min        1.000000
25%        2.000000
50%        4.000000
75%        4.000000
max        5.000000
Name: Answer, dtype: float64

In [59]:
# Assuming you have a DataFrame `df` defined elsewhere
results_df_gpt_b = calculate_personality_scores(df_gpt_b, scoring_key, reverse_scored)

# Print or save the results
print(results_df_gpt_b)
results_df_gpt_b.to_csv('bfi_Gpt_B.csv', index=False)

                   Persona  Extraversion  Agreeableness  Conscientiousness  \
0     Abu Bakr al-Baghdadi         3.750          3.667              4.111   
1             Adolf Hitler         3.500          2.222              4.444   
2                Al Capone         4.500          3.222              4.444   
3   Anders Behring Breivik         3.125          1.889              4.333   
4         Benito Mussolini         4.750          2.667              4.778   
5             Bugsy Siegel         4.500          3.667              4.000   
6           Charles Manson         4.500          2.778              3.667   
7          Jack the Ripper         3.625          3.111              3.889   
8           Jeffrey Dahmer         2.375          2.556              4.000   
9               John Gotti         4.500          3.333              4.556   
10           Joseph Stalin         3.750          2.667              4.778   
11              Mao Zedong         4.250          4.111         