### Top N concepts of articles written by asian and non-asian

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

ms = pd.read_csv("MS_result.csv")
nms = pd.read_csv("NMS_result.csv")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def get_top_n(data: pd.DataFrame, from_asian: bool, n: int=10) -> dict:
    """
    Note
        data should have columns named by "Key", "Value from Asian", and "Value from NonAsian"
    Return
        dictionary with keys are in "Key", values are amount of paper which related to topic
    """
    col = "Value from Asian" if from_asian else "Value from NonAsian"
    temp = (sorted_data:=data.sort_values(by=[col], ascending=False).head(n))[col]
    denominater = sum(data[col])
    keys = sorted_data["Key"]
    res = {topic.strip(): round(counter / denominater, 4) for topic, counter in zip(keys, temp)}
    
    return res

In [4]:
asian = get_top_n(nms, True)
non_asian = get_top_n(nms, False)

"""Correlation coefficient for same topic in MS and NMS"""
keys = set(asian.keys()).intersection(set(non_asian.keys()))
x = [asian[key] for key in keys]
y = [non_asian[key] for key in keys]
print(f"corr. coef. between intersection: {np.corrcoef(x, y)}")

corr. coef. between intersection: [[1.         0.92735375]
 [0.92735375 1.        ]]


In [5]:
"""Correlation coefficient for same topic in MS and NMS"""
keys = list(set(asian.keys()).union(set(non_asian.keys())))
sum_asian = sum(nms["Value from Asian"])
sum_nonasian = sum(nms["Value from NonAsian"])
x = nms[nms["Key"].isin(keys)]["Value from Asian"].to_numpy() / sum_asian
y = nms[nms["Key"].isin(keys)]["Value from NonAsian"].to_numpy() / sum_nonasian
print(f"corr. coef. between union: {np.corrcoef(x, y)}")
a = nms[nms["Key"].isin(keys)]
a["Value from Asian"] = (a.loc[:, "Value from Asian"] / sum_asian).apply(lambda x: round(x, 4)).to_list()
a["Value from NonAsian"] = (a.loc[:, "Value from NonAsian"] / sum_nonasian).apply(lambda x: round(x, 4)).to_list()
a = a.sort_values(by=["Value from Asian", "Value from NonAsian"], ascending=False)
a

corr. coef. between union: [[1.         0.83502049]
 [0.83502049 1.        ]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a["Value from Asian"] = (a.loc[:, "Value from Asian"] / sum_asian).apply(lambda x: round(x, 4)).to_list()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a["Value from NonAsian"] = (a.loc[:, "Value from NonAsian"] / sum_nonasian).apply(lambda x: round(x, 4)).to_list()


Unnamed: 0,Key,Value from Asian,Value from NonAsian
73,Racism (general),0.1049,0.1263
70,Anti-Asian hate crimes,0.0861,0.0909
4,COVID-19 or coronavirus or pandemic,0.0599,0.0606
17,Worry about safety,0.0599,0.0404
59,Verbal harassment,0.0562,0.0581
64,“China/Chinese virus” or “Kung flu/plague” or ...,0.0524,0.0202
10,Perpetual foreigner (or forever foreigner or g...,0.0449,0.0177
55,"Racism (gendered, misogynistic) or racism towa...",0.0375,0.048
2,Impacts of racism on AA population (unemployme...,0.03,0.0227
48,Stop AAPI Hate,0.015,0.0303


In [6]:
keys = set(asian.keys()).symmetric_difference(set(non_asian.keys()))
sum_asian = sum(nms["Value from Asian"])
sum_nonasian = sum(nms["Value from NonAsian"])
x = nms[nms["Key"].isin(keys)]["Value from Asian"].to_numpy() / sum_asian
y = nms[nms["Key"].isin(keys)]["Value from NonAsian"].to_numpy() / sum_nonasian
print(f"corr. coef. between xor: {np.corrcoef(x, y)}")
print(np.std(x))
print(np.std(y))
a = nms[nms["Key"].isin(keys)]
a["Value from Asian"] = (a.loc[:, "Value from Asian"] / sum_asian).apply(lambda x: round(x, 4)).to_list()
a["Value from NonAsian"] = (a.loc[:, "Value from NonAsian"] / sum_nonasian).apply(lambda x: round(x, 4)).to_list()
a = a.sort_values(by=["Value from Asian", "Value from NonAsian"], ascending=False)
a


corr. coef. between xor: [[ 1.         -0.93788893]
 [-0.93788893  1.        ]]
0.016135325264822482
0.005199813202518688


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a["Value from Asian"] = (a.loc[:, "Value from Asian"] / sum_asian).apply(lambda x: round(x, 4)).to_list()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a["Value from NonAsian"] = (a.loc[:, "Value from NonAsian"] / sum_nonasian).apply(lambda x: round(x, 4)).to_list()


Unnamed: 0,Key,Value from Asian,Value from NonAsian
64,“China/Chinese virus” or “Kung flu/plague” or ...,0.0524,0.0202
10,Perpetual foreigner (or forever foreigner or g...,0.0449,0.0177
2,Impacts of racism on AA population (unemployme...,0.03,0.0227
48,Stop AAPI Hate,0.015,0.0303
29,Color blind/minimizing racism,0.0112,0.0303


In [7]:
a.set_index("Key", inplace=True)
a.columns = ["Asian", "Non Asian"]
print(a.to_markdown())

| Key                                                                                                                                                                                                                                                                                                                             |   Asian |   Non Asian |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|------------:|
| “China/Chinese virus” or “Kung flu/plague” or “Wuhan virus/plague” or “Diseased Chinese” or “Asian Virus” or “Ramen Noodle flu”                                                                                                                                                                               

In [8]:
asian, non_asian

({'Racism (general)': 0.1049,
  'Anti-Asian violence/ assaults/attacks': 0.1049,
  'Anti-Asian hate crimes': 0.0861,
  'Worry about safety': 0.0599,
  'COVID-19 or coronavirus or pandemic': 0.0599,
  'Verbal harassment': 0.0562,
  '“China/Chinese virus” or “Kung flu/plague” or “Wuhan virus/plague” or “Diseased Chinese” or “Asian Virus” or “Ramen Noodle flu”': 0.0524,
  'Perpetual foreigner (or forever foreigner or go back to China or You don’t belong to here)': 0.0449,
  'Racism (gendered, misogynistic) or racism toward Asian American women)': 0.0375,
  'Impacts of racism on AA population (unemployment, mental health problems, trauma, PTSD, Asian Target Anxiety Syndrome,\xa0 negative emotions (sadness), distrust, isolating self, Asian-based business had poor business or was worried about being attacked, child well-being, increasingly purchasing guns to defend themselves)': 0.03},
 {'Racism (general)': 0.1263,
  'Anti-Asian violence/ assaults/attacks': 0.1035,
  'Anti-Asian hate crimes'

In [9]:
df = pd.DataFrame([non_asian]).T
df.columns = ["NonAsian"]
print(df.to_markdown())

|                                                                        |   NonAsian |
|:-----------------------------------------------------------------------|-----------:|
| Racism (general)                                                       |     0.1263 |
| Anti-Asian violence/ assaults/attacks                                  |     0.1035 |
| Anti-Asian hate crimes                                                 |     0.0909 |
| COVID-19 or coronavirus or pandemic                                    |     0.0606 |
| Verbal harassment                                                      |     0.0581 |
| Racism (gendered, misogynistic) or racism toward Asian American women) |     0.048  |
| Discrimination                                                         |     0.0455 |
| Worry about safety                                                     |     0.0404 |
| Stop AAPI Hate                                                         |     0.0303 |
| Color blind/minimizing racism 