#### Queries with no response in Baidu


In [1]:
# import pandas
import pandas as pd

In [3]:
# Read the group dictionary CSV; later cells use its 'Category', 'Translation'
# and 'English' columns to build the search queries.
data = pd.read_csv('../data/Groups_finnaly.csv')

In [12]:
# Query templates; "待替换词汇" is the placeholder that is replaced by each group term.
templates = ["为什么待替换词汇这么", "待替换词汇为什么这么"]

# Map each category to the list of translated group terms found in that category.
translation_dict = data.groupby('Category')['Translation'].apply(list).to_dict()

# Map each translated group term to the 'English' value of the same row.
# If a translation occurs on several rows, the last row wins (dict semantics).
group_to_english = data.set_index('Translation')['English'].to_dict()

# One row per (category, group, template) combination.
rows = [
    {
        # Fill the placeholder, then encode spaces as '+'.
        'query': template.replace("待替换词汇", group).replace(" ", "+"),
        'category': category,
        'group': group,
        # Fall back to 'Unknown' when the group has no entry in the mapping.
        'English': group_to_english.get(group, 'Unknown'),
    }
    for category, groups in translation_dict.items()
    for group in groups
    for template in templates
]

# Build the frame once. Passing `columns` keeps the expected schema even when
# `rows` is empty, so later `df['query']` lookups do not raise KeyError.
df = pd.DataFrame(rows, columns=['query', 'category', 'group', 'English'])

def check_unresponded_queries(main_df, comparison_df):
    """
    Return the rows of main_df whose query does not occur in comparison_df.

    :param main_df: DataFrame with a 'query' column
    :param comparison_df: DataFrame with a 'query' column to compare against
    :return: rows of main_df whose 'query' value is absent from comparison_df,
             keeping the first row per distinct query, with a fresh 0..n-1 index
    """
    wanted = set(main_df['query'])
    seen = set(comparison_df['query'])
    missing = wanted - seen

    is_missing = main_df['query'].isin(missing)
    missing_rows = main_df.loc[is_missing]
    # Keep only the first row for each distinct query string.
    missing_rows = missing_rows.drop_duplicates(subset='query')
    return missing_rows.reset_index(drop=True)

# Load the Baidu response data in this cell so it does not rely on another
# cell having been executed first (a fresh-kernel run would otherwise hit a
# NameError on `baidu`).
baidu = pd.read_csv('../data/baidu/baidu_2023-11-26.csv')

# Generated queries that do not appear in the Baidu response data.
unresponded_queries_baidu = check_unresponded_queries(df, baidu)
unresponded_queries_baidu

Unnamed: 0,query,category,group,English
0,为什么婴儿潮一代这么,Age,婴儿潮一代,boomers
1,婴儿潮一代为什么这么,Age,婴儿潮一代,boomers
2,为什么继母这么,Gender,继母,stepmothers
3,继母为什么这么,Gender,继母,stepmothers
4,为什么先生们这么,Gender,先生们,gentlemen
5,先生们为什么这么,Gender,先生们,gentlemen
6,为什么潮人这么,Lifestyle,潮人,hipsters
7,潮人为什么这么,Lifestyle,潮人,hipsters
8,为什么哥伦比亚人这么,Nationalities,哥伦比亚人,Colombians
9,为什么金发女郎这么,Peoples,金发女郎,blondes


In [5]:
# Load the Baidu result file and collect the generated queries (from `df`)
# whose 'query' value does not appear in it.
baidu = pd.read_csv('../data/baidu/baidu_2023-11-26.csv')
unresponded_queries_baidu = check_unresponded_queries(df, baidu)


Unnamed: 0,query,category,group
0,为什么婴儿潮一代这么,Age,婴儿潮一代
1,婴儿潮一代为什么这么,Age,婴儿潮一代
2,为什么继母这么,Gender,继母
3,继母为什么这么,Gender,继母
4,为什么先生们这么,Gender,先生们
5,先生们为什么这么,Gender,先生们
6,为什么潮人这么,Lifestyle,潮人
7,潮人为什么这么,Lifestyle,潮人
8,为什么哥伦比亚人这么,Nationalities,哥伦比亚人
9,为什么金发女郎这么,Peoples,金发女郎


In [6]:
# Display at most the first 25 unresponded Baidu queries.
unresponded_queries_baidu.head(25)

Unnamed: 0,query,category,group
0,为什么婴儿潮一代这么,Age,婴儿潮一代
1,婴儿潮一代为什么这么,Age,婴儿潮一代
2,为什么继母这么,Gender,继母
3,继母为什么这么,Gender,继母
4,为什么先生们这么,Gender,先生们
5,先生们为什么这么,Gender,先生们
6,为什么潮人这么,Lifestyle,潮人
7,潮人为什么这么,Lifestyle,潮人
8,为什么哥伦比亚人这么,Nationalities,哥伦比亚人
9,为什么金发女郎这么,Peoples,金发女郎


#### Queries with no response in Western search engines

In [7]:
# Load the result files for Google, Yahoo and DuckDuckGo.
# Each frame is later passed to check_unresponded_queries, which reads its
# 'query' column.
google = pd.read_csv('../data/google/google_2023-11-26.csv')
yahoo = pd.read_csv('../data/yahoo/yahoo_2023-11-26.csv')
duckduckgo = pd.read_csv('../data/duckduckgo/duckduckgo_2023-11-26.csv')

In [8]:
# Map each category to the list of English group terms found in that category.
# NOTE: this rebinds `translation_dict` and `templates`, which the Baidu
# section also assigns.
translation_dict = data.groupby('Category')['English'].apply(list).to_dict()

# Query template; "term_to_use" is the placeholder replaced by each group term.
templates = ["why are term_to_use so"]

# One row per (category, group, template) combination.
rows = [
    {'query': template.replace("term_to_use", group), 'category': category, 'group': group}
    for category, groups in translation_dict.items()
    for group in groups
    for template in templates
]

# Build the frame once. Passing `columns` keeps the expected schema even when
# `rows` is empty, so later `df_original['query']` lookups do not raise KeyError.
df_original = pd.DataFrame(rows, columns=['query', 'category', 'group'])

# NOTE: this function is also defined in the Baidu section above with the same
# behavior; it is repeated here so this cell can be run on its own.
def check_unresponded_queries(main_df, comparison_df):
    """
    Return the rows of main_df whose query does not occur in comparison_df.

    :param main_df: DataFrame with a 'query' column
    :param comparison_df: DataFrame with a 'query' column to compare against
    :return: rows of main_df whose 'query' value is absent from comparison_df,
             keeping the first row per distinct query, with a fresh 0..n-1 index
    """
    wanted = set(main_df['query'])
    seen = set(comparison_df['query'])
    missing = wanted - seen

    is_missing = main_df['query'].isin(missing)
    missing_rows = main_df.loc[is_missing]
    # Keep only the first row for each distinct query string.
    missing_rows = missing_rows.drop_duplicates(subset='query')
    return missing_rows.reset_index(drop=True)


In [9]:
# For each Western search engine, collect the generated English queries
# (from `df_original`) whose 'query' value does not appear in that engine's results.
# NOTE(review): the recorded output below shows "Unresponded Queries in ..." labels
# that this cell does not print; presumably it came from an earlier version of the
# cell. Confirm and re-run.
unresponded_queries_google = check_unresponded_queries(df_original, google)
unresponded_queries_yahoo = check_unresponded_queries(df_original, yahoo)
unresponded_queries_duckduckgo = check_unresponded_queries(df_original, duckduckgo)

Unresponded Queries in Google:                           query            category             group
0            why are old men so                 Age           old men
1          why are old women so                 Age         old women
2               why are boys so              Gender              boys
3             why are ladies so              Gender            ladies
4         why are schoolboys so              Gender        schoolboys
..                          ...                 ...               ...
66        why are gay people so  Sexual Orientation        gay people
67       why are homosexuals so  Sexual Orientation       homosexuals
68          why are lesbians so  Sexual Orientation          lesbians
69  why are pansexual people so  Sexual Orientation  pansexual people
70      why are queer people so  Sexual Orientation      queer people

[71 rows x 3 columns]
Unresponded Queries in Yahoo:                           query            category             group
0     

#### Save the unresponded queries


Saved: ../data/unresponded/google_unresponded.csv
Saved: ../data/unresponded/yahoo_unresponded.csv
Saved: ../data/unresponded/duckduckgo_unresponded.csv
Saved: ../data/unresponded/baidu_unresponded.csv
