In [9]:
import pandas as pd
import statsmodels.api as sm

# Load the datasets
# Maps a display label for each engine / Baidu query template to the CSV it is read from.
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}


# One DataFrame per label, keyed exactly like file_paths.
dfs = {}

# Process each file
for engine, path in file_paths.items():
    df = pd.read_csv(path)
    # Convert 'common_sentiment_rating' to numeric, treating non-numeric as NaN
    # (Series.count() skips NaN, so such rows drop out of the rating counts).
    df['common_sentiment_rating'] = pd.to_numeric(df['common_sentiment_rating'], errors='coerce')
    dfs[engine] = df

# Function to compare Baidu Template 1 with other search engines for each category using Chi-square test
def compare_baidu_template_1(dfs):
    baidu_engine = "Baidu Template 1"
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = {}
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]

        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment (common_sentiment_rating == 3) and others
            count_negative_baidu = baidu_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_df['common_sentiment_rating'].count() - count_negative_baidu

            count_negative_other = other_df['common_sentiment_rating'].eq(3).sum()
            count_others_other = other_df['common_sentiment_rating'].count() - count_negative_other

            # Construct contingency table
            contingency_table = [[count_negative_baidu, count_others_baidu], 
                                 [count_negative_other, count_others_other]]

            # Perform Chi-square test
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
            chi2 = test_result.statistic
            p = test_result.pvalue

            # Store the results
            results[(baidu_engine, other_engine, category)] = {
                'chi2': chi2,
                'p-value': p
            }

    return results

# Comparing Baidu Template 1 with Google, Yahoo, and DuckDuckGo
comparison_results = compare_baidu_template_1(dfs)
# Bare last expression: the notebook renders the result dict as the cell output.
comparison_results


{('Baidu Template 1', 'Google', 'Political'): {'chi2': 0.045351473922902605,
  'p-value': 0.8313590554579958},
 ('Baidu Template 1', 'Yahoo', 'Political'): {'chi2': 2.621011052978066,
  'p-value': 0.10545722738902075},
 ('Baidu Template 1', 'DuckDuckGo', 'Political'): {'chi2': 0.8997732426303856,
  'p-value': 0.3428425204030051},
 ('Baidu Template 1', 'Google', 'Nationalities'): {'chi2': 4.435288921497615,
  'p-value': 0.03520326096510962},
 ('Baidu Template 1', 'Yahoo', 'Nationalities'): {'chi2': 2.56107080506829,
  'p-value': 0.10952437690649497},
 ('Baidu Template 1',
  'DuckDuckGo',
  'Nationalities'): {'chi2': 15.781962605103512, 'p-value': 7.107690115060183e-05},
 ('Baidu Template 1', 'Google', 'Peoples'): {'chi2': 2.6395320970488534,
  'p-value': 0.10423423874507398},
 ('Baidu Template 1', 'Yahoo', 'Peoples'): {'chi2': 0.10383712284271884,
  'p-value': 0.7472726405847975},
 ('Baidu Template 1', 'DuckDuckGo', 'Peoples'): {'chi2': 5.877170360897984,
  'p-value': 0.0153384529494329

In [10]:
import pandas as pd
import statsmodels.api as sm

# Load the datasets
# NOTE(review): this repeats the loading code of the previous cell verbatim;
# the files are read from disk again each time the cell runs.
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}

# Engine label -> DataFrame.
dfs = {}

# Process each file
for engine, path in file_paths.items():
    df = pd.read_csv(path)
    # Non-numeric ratings become NaN so they are skipped by Series.count().
    df['common_sentiment_rating'] = pd.to_numeric(df['common_sentiment_rating'], errors='coerce')
    dfs[engine] = df

def compare_baidu_template_1(dfs):
    baidu_engine = "Baidu Template 1"
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = []
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]
        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment and others
            count_negative_baidu = baidu_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_df['common_sentiment_rating'].count() - count_negative_baidu
            count_negative_other = other_df['common_sentiment_rating'].eq(3).sum()
            count_others_other = other_df['common_sentiment_rating'].count() - count_negative_other

            # Construct contingency table and perform Chi-square test
            contingency_table = [[count_negative_baidu, count_others_baidu], [count_negative_other, count_others_other]]
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()

            # Store the results in a list of dictionaries
            results.append({
                'Baidu Engine': baidu_engine,
                'Other Engine': other_engine,
                'Category': category,
                'Chi-Square': test_result.statistic,
                'P-Value': test_result.pvalue
            })

    return pd.DataFrame(results)

# Comparing Baidu Template 1 with other engines
comparison_results_df = compare_baidu_template_1(dfs)

# Displaying the results in a DataFrame
# (print() emits the plain-text repr rather than the notebook's rich table display.)
print(comparison_results_df)


        Baidu Engine Other Engine       Category  Chi-Square   P-Value
0   Baidu Template 1       Google      Political    0.045351  0.831359
1   Baidu Template 1        Yahoo      Political    2.621011  0.105457
2   Baidu Template 1   DuckDuckGo      Political    0.899773  0.342843
3   Baidu Template 1       Google  Nationalities    4.435289  0.035203
4   Baidu Template 1        Yahoo  Nationalities    2.561071  0.109524
5   Baidu Template 1   DuckDuckGo  Nationalities   15.781963  0.000071
6   Baidu Template 1       Google        Peoples    2.639532  0.104234
7   Baidu Template 1        Yahoo        Peoples    0.103837  0.747273
8   Baidu Template 1   DuckDuckGo        Peoples    5.877170  0.015338
9   Baidu Template 1       Google         Gender    0.001992  0.964403
10  Baidu Template 1        Yahoo         Gender    5.899989  0.015141
11  Baidu Template 1   DuckDuckGo         Gender    4.190879  0.040642
12  Baidu Template 1       Google            Age    0.811785  0.367593
13  Ba

In [11]:
import pandas as pd
import statsmodels.api as sm

# Load the datasets
# Engine / template label -> CSV path (same mapping as in the earlier cells).
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}

# Engine label -> DataFrame.
dfs = {}

# Process each file
for engine, path in file_paths.items():
    df = pd.read_csv(path)
    # Coerce ratings to numbers; anything unparsable becomes NaN.
    df['common_sentiment_rating'] = pd.to_numeric(df['common_sentiment_rating'], errors='coerce')
    dfs[engine] = df

def compare_baidu_template(dfs, baidu_engine):
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = []
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]
        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment and others
            count_negative_baidu = baidu_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_df['common_sentiment_rating'].count() - count_negative_baidu
            count_negative_other = other_df['common_sentiment_rating'].eq(3).sum()
            count_others_other = other_df['common_sentiment_rating'].count() - count_negative_other

            # Construct contingency table and perform Chi-square test
            contingency_table = [[count_negative_baidu, count_others_baidu], [count_negative_other, count_others_other]]
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()

            # Store the results in a list of dictionaries
            results.append({
                'Baidu Engine': baidu_engine,
                'Other Engine': other_engine,
                'Category': category,
                'Chi-Square': test_result.statistic,
                'P-Value': test_result.pvalue
            })

    return pd.DataFrame(results)

# Comparing Baidu Template 1 with other engines
comparison_results_df_template1 = compare_baidu_template(dfs, "Baidu Template 1")
print(comparison_results_df_template1)

# Comparing Baidu Template 2 with other engines
# (same function, only the reference data set changes)
comparison_results_df_template2 = compare_baidu_template(dfs, "Baidu Template 2")
print(comparison_results_df_template2)


        Baidu Engine Other Engine       Category  Chi-Square   P-Value
0   Baidu Template 1       Google      Political    0.045351  0.831359
1   Baidu Template 1        Yahoo      Political    2.621011  0.105457
2   Baidu Template 1   DuckDuckGo      Political    0.899773  0.342843
3   Baidu Template 1       Google  Nationalities    4.435289  0.035203
4   Baidu Template 1        Yahoo  Nationalities    2.561071  0.109524
5   Baidu Template 1   DuckDuckGo  Nationalities   15.781963  0.000071
6   Baidu Template 1       Google        Peoples    2.639532  0.104234
7   Baidu Template 1        Yahoo        Peoples    0.103837  0.747273
8   Baidu Template 1   DuckDuckGo        Peoples    5.877170  0.015338
9   Baidu Template 1       Google         Gender    0.001992  0.964403
10  Baidu Template 1        Yahoo         Gender    5.899989  0.015141
11  Baidu Template 1   DuckDuckGo         Gender    4.190879  0.040642
12  Baidu Template 1       Google            Age    0.811785  0.367593
13  Ba

In [12]:
import  pandas as pd
# Inspect which columns the Baidu Template 1 file provides.
baidu_1 = pd.read_csv('../data/temp/modified_baidu_follow_query_template_1.csv')
baidu_1.columns

Index(['query', 'suggestion', 'category', 'group', 'completion',
       'suggestion_starts_with_query', 'extracted_suggestion',
       'suggestion_follows_query', 'sentiment_rating_completion',
       'sentiment_rating_full_suggestion', 'English',
       'common_sentiment_rating'],
      dtype='object')

In [13]:
import pandas as pd
import statsmodels.api as sm

# Load the datasets
# Engine / template label -> CSV path.
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}

# Engine label -> DataFrame.
dfs = {}

# Process each file
for engine, path in file_paths.items():
    df = pd.read_csv(path)
    # This cell analyses 'sentiment_rating_full_suggestion' instead of
    # 'common_sentiment_rating'; only that column is coerced to numeric here.
    df['sentiment_rating_full_suggestion'] = pd.to_numeric(df['sentiment_rating_full_suggestion'], errors='coerce')
    dfs[engine] = df

def compare_baidu_template(dfs, baidu_engine):
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = []
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]
        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment and others
            count_negative_baidu = baidu_df['sentiment_rating_full_suggestion'].eq(3).sum()
            count_others_baidu = baidu_df['sentiment_rating_full_suggestion'].count() - count_negative_baidu
            count_negative_other = other_df['sentiment_rating_full_suggestion'].eq(3).sum()
            count_others_other = other_df['sentiment_rating_full_suggestion'].count() - count_negative_other

            # Construct contingency table and perform Chi-square test
            contingency_table = [[count_negative_baidu, count_others_baidu], [count_negative_other, count_others_other]]
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()

            # Store the results in a list of dictionaries
            results.append({
                'Baidu Engine': baidu_engine,
                'Other Engine': other_engine,
                'Category': category,
                'Chi-Square': test_result.statistic,
                'P-Value': test_result.pvalue
            })

    return pd.DataFrame(results)

# Comparing Baidu Template 1 with other engines
# (this rebinds the names used by the earlier 'common_sentiment_rating' run)
comparison_results_df_template1 = compare_baidu_template(dfs, "Baidu Template 1")
print(comparison_results_df_template1)

# Comparing Baidu Template 2 with other engines
comparison_results_df_template2 = compare_baidu_template(dfs, "Baidu Template 2")
print(comparison_results_df_template2)


        Baidu Engine Other Engine       Category  Chi-Square       P-Value
0   Baidu Template 1       Google      Political    1.632653  2.013365e-01
1   Baidu Template 1        Yahoo      Political    1.152517  2.830229e-01
2   Baidu Template 1   DuckDuckGo      Political    1.448111  2.288313e-01
3   Baidu Template 1       Google  Nationalities   12.544569  3.973595e-04
4   Baidu Template 1        Yahoo  Nationalities    1.087658  2.969907e-01
5   Baidu Template 1   DuckDuckGo  Nationalities   31.228648  2.293562e-08
6   Baidu Template 1       Google        Peoples    4.498980  3.391509e-02
7   Baidu Template 1        Yahoo        Peoples    1.418981  2.335711e-01
8   Baidu Template 1   DuckDuckGo        Peoples   10.118467  1.467911e-03
9   Baidu Template 1       Google         Gender    0.322460  5.701330e-01
10  Baidu Template 1        Yahoo         Gender    5.318183  2.110401e-02
11  Baidu Template 1   DuckDuckGo         Gender   11.254656  7.942353e-04
12  Baidu Template 1     

In [14]:
import pandas as pd
import statsmodels.api as sm

# Load the datasets
# Engine / template label -> CSV path.
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}

# Read every file into dfs (engine label -> DataFrame)
dfs = {}
for engine, path in file_paths.items():
    df = pd.read_csv(path)
    # Convert 'common_sentiment_rating' to numeric, treating non-numeric as NaN
    df['common_sentiment_rating'] = pd.to_numeric(df['common_sentiment_rating'], errors='coerce')
    dfs[engine] = df

# Function to compare common groups and categories in Baidu templates with other search engines
def compare_common_groups(dfs):
    results = {}
    baidu_engines = ["Baidu Template 1", "Baidu Template 2"]
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    categories = set.intersection(*(set(df['category'].dropna()) for df in dfs.values()))
    common_groups = set.intersection(*(set(df['group'].dropna()) for df in dfs.values()))

    for group in common_groups:
        for category in categories:
            for baidu_engine in baidu_engines:
                baidu_df = dfs[baidu_engine][(dfs[baidu_engine]['group'] == group) & 
                                             (dfs[baidu_engine]['category'] == category)]

                for other_engine in other_engines:
                    other_df = dfs[other_engine][(dfs[other_engine]['group'] == group) & 
                                                 (dfs[other_engine]['category'] == category)]

                    # Count negative sentiment and others
                    count_negative_baidu = baidu_df['common_sentiment_rating'].eq(3).sum()
                    count_others_baidu = baidu_df['common_sentiment_rating'].count() - count_negative_baidu

                    count_negative_other = other_df['common_sentiment_rating'].eq(3).sum()
                    count_others_other = other_df['common_sentiment_rating'].count() - count_negative_other

                    # Construct contingency table
                    contingency_table = [[count_negative_baidu, count_others_baidu], 
                                         [count_negative_other, count_others_other]]

                    # Perform Chi-square test
                    test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
                    chi2 = test_result.statistic
                    p = test_result.pvalue

                    # Store the results
                    results[(baidu_engine, other_engine, group, category)] = {
                        'chi2': chi2,
                        'p-value': p
                    }

    return results

# Comparing common groups and categories between Baidu Templates and other engines
comparison_results = compare_common_groups(dfs)

# Print or analyze the results as needed
# (one line per (baidu_engine, other_engine, group, category) key; prints nothing if the dict is empty)
for key, value in comparison_results.items():
    print(f"{key}: Chi-square = {value['chi2']}, P-value = {value['p-value']}")


In [15]:
import pandas as pd
import statsmodels.api as sm

# Engine / template label -> CSV path.
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "DuckDuckGo": '../data/temp/modified_duckduckgo_follow_query.csv',
    "Google": '../data/temp/modified_google_follow_query.csv',
    "Yahoo": '../data/temp/modified_yahoo_follow_query.csv'
}

# Loading the datasets into DataFrames
# NOTE(review): unlike the earlier cells, the rating columns are not passed through
# pd.to_numeric(..., errors='coerce') here - confirm the raw columns are already numeric.
dfs = {engine: pd.read_csv(path) for engine, path in file_paths.items()}

# Combining Baidu Template 1 and 2 and removing duplicates based on 'group'
# NOTE(review): drop_duplicates(subset='group') keeps only the first row for each
# distinct 'group' value, so the combined frame holds a single row per group (taken
# from Template 1 whenever the group occurs in both). Confirm this is intended.
dfs["Combined Baidu"] = pd.concat([dfs["Baidu Template 1"], dfs["Baidu Template 2"]]).drop_duplicates(subset='group')

# Function to compare Combined Baidu with other search engines using Chi-square test
def compare_combined_baidu(dfs):
    baidu_engine = "Combined Baidu"
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = {}
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]

        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment (common_sentiment_rating == 3) and others
            count_negative_baidu = baidu_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_df['common_sentiment_rating'].count() - count_negative_baidu

            count_negative_other = other_df['common_sentiment_rating'].eq(3).sum()
            count_others_other = other_df['common_sentiment_rating'].count() - count_negative_other

            # Construct contingency table
            contingency_table = [[count_negative_baidu, count_others_baidu], 
                                 [count_negative_other, count_others_other]]

            # Perform Chi-square test
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
            chi2 = test_result.statistic
            p = test_result.pvalue

            # Store the results
            results[(baidu_engine, other_engine, category)] = {
                'chi2': chi2,
                'p-value': p
            }

    return results

# Comparing Combined Baidu with Google, Yahoo, and DuckDuckGo
comparison_results_combined_baidu = compare_combined_baidu(dfs)

# Displaying the comparison results
# (bare last expression: rendered as the cell output)
comparison_results_combined_baidu


{('Combined Baidu', 'Google', 'Political'): {'chi2': 0.27777777777777757,
  'p-value': 0.598161452683528},
 ('Combined Baidu', 'Yahoo', 'Political'): {'chi2': 4.704301075268818,
  'p-value': 0.03008723326295426},
 ('Combined Baidu', 'DuckDuckGo', 'Political'): {'chi2': 0.12222222222222222,
  'p-value': 0.7266366971638503},
 ('Combined Baidu', 'Google', 'Nationalities'): {'chi2': 4.914089824213251,
  'p-value': 0.026638495010083973},
 ('Combined Baidu', 'Yahoo', 'Nationalities'): {'chi2': 3.476263500052428,
  'p-value': 0.06225516181341606},
 ('Combined Baidu',
  'DuckDuckGo',
  'Nationalities'): {'chi2': 15.113830553066968, 'p-value': 0.00010121900630910563},
 ('Combined Baidu', 'Google', 'Peoples'): {'chi2': 0.012878048780487785,
  'p-value': 0.9096488363925204},
 ('Combined Baidu', 'Yahoo', 'Peoples'): {'chi2': 0.9072907290729074,
  'p-value': 0.3408342913912614},
 ('Combined Baidu', 'DuckDuckGo', 'Peoples'): {'chi2': 2.1365319865319865,
  'p-value': 0.14382671048754392},
 ('Combined

In [16]:
# Updating the function to compare the combined Baidu dataset with other search engines

def compare_combined_baidu(dfs, baidu_engine):
    other_engines = ["Google", "Yahoo", "DuckDuckGo"]
    results = []
    categories = set(dfs[baidu_engine]['category'].dropna())

    for category in categories:
        baidu_df = dfs[baidu_engine][dfs[baidu_engine]['category'] == category]
        for other_engine in other_engines:
            other_df = dfs[other_engine][dfs[other_engine]['category'] == category]

            # Count negative sentiment and others
            count_negative_baidu = baidu_df['sentiment_rating_full_suggestion'].eq(3).sum()
            count_others_baidu = baidu_df['sentiment_rating_full_suggestion'].count() - count_negative_baidu
            count_negative_other = other_df['sentiment_rating_full_suggestion'].eq(3).sum()
            count_others_other = other_df['sentiment_rating_full_suggestion'].count() - count_negative_other

            # Construct contingency table and perform Chi-square test
            contingency_table = [[count_negative_baidu, count_others_baidu], [count_negative_other, count_others_other]]
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()

            # Store the results in a list of dictionaries
            results.append({
                'Baidu Engine': baidu_engine,
                'Other Engine': other_engine,
                'Category': category,
                'Chi-Square': test_result.statistic,
                'P-Value': test_result.pvalue
            })

    return pd.DataFrame(results)

# Combining Baidu Template 1 and 2 and removing duplicates based on 'group'
# NOTE(review): this cell has no imports or loading code of its own; `pd`, `sm` and
# `dfs` must already exist from an earlier cell.
# NOTE(review): drop_duplicates(subset='group') keeps only the first row per distinct
# 'group' value, i.e. one row per group - confirm that is the intended de-duplication.
combined_baidu = pd.concat([dfs["Baidu Template 1"], dfs["Baidu Template 2"]]).drop_duplicates(subset='group')
dfs["Combined Baidu"] = combined_baidu

# Comparing Combined Baidu with other search engines
comparison_results_df_combined_baidu = compare_combined_baidu(dfs, "Combined Baidu")
comparison_results_df_combined_baidu



Unnamed: 0,Baidu Engine,Other Engine,Category,Chi-Square,P-Value
0,Combined Baidu,Google,Political,1.75,0.1858767
1,Combined Baidu,Yahoo,Political,1.322449,0.250153
2,Combined Baidu,DuckDuckGo,Political,0.4,0.5270893
3,Combined Baidu,Google,Nationalities,11.761496,0.0006046861
4,Combined Baidu,Yahoo,Nationalities,0.102776,0.7485233
5,Combined Baidu,DuckDuckGo,Nationalities,0.139635,0.7086442
6,Combined Baidu,Google,Peoples,9.652643,0.001890794
7,Combined Baidu,Yahoo,Peoples,1.750337,0.1858344
8,Combined Baidu,DuckDuckGo,Peoples,0.048953,0.8248947
9,Combined Baidu,Google,Gender,36.62441,1.432269e-09


In [20]:
import pandas as pd
import statsmodels.api as sm

# Paths to the datasets
# (only the two Baidu templates and Google are used from here on)
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "Google": '../data/temp/modified_google_follow_query.csv'
}

# Loading the datasets into DataFrames
# NOTE(review): rating columns are not coerced with pd.to_numeric here, unlike the
# earlier cells - confirm the raw columns are already numeric.
dfs = {engine: pd.read_csv(path) for engine, path in file_paths.items()}

# Combining Baidu Template 1 and 2 and removing duplicates based on 'group'
# NOTE(review): drop_duplicates(subset='group') keeps only the first row per distinct
# 'group' value, so each Baidu group contributes a single row - confirm this is intended.
combined_baidu = pd.concat([dfs["Baidu Template 1"], dfs["Baidu Template 2"]]).drop_duplicates(subset='group')

# Loading the Google dataset
google = dfs["Google"]

# Identify common groups
# Baidu groups are matched through their 'English' column against Google's 'group' column.
common_groups = set(combined_baidu['English']).intersection(set(google['group']))

# Filter each dataset for common groups
combined_baidu_common = combined_baidu[combined_baidu['English'].isin(common_groups)]
google_common = google[google['group'].isin(common_groups)]

# Function to perform Chi-square test for each common group in each category
def compare_common_groups(baidu_df, google_df):
    results = {}
    categories = set(baidu_df['category'].dropna())

    for category in categories:
        baidu_category_df = baidu_df[baidu_df['category'] == category]
        google_category_df = google_df[google_df['category'] == category]

        for group in common_groups:
            baidu_group_df = baidu_category_df[baidu_category_df['group'] == group]
            google_group_df = google_category_df[google_category_df['group'] == group]

            # Count negative sentiment (common_sentiment_rating == 3) and others
            count_negative_baidu = baidu_group_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_group_df['common_sentiment_rating'].count() - count_negative_baidu

            count_negative_google = google_group_df['common_sentiment_rating'].eq(3).sum()
            count_others_google = google_group_df['common_sentiment_rating'].count() - count_negative_google

            # Construct contingency table
            contingency_table = [[count_negative_baidu, count_others_baidu], 
                                 [count_negative_google, count_others_google]]

            # Perform Chi-square test
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
            chi2 = test_result.statistic
            p = test_result.pvalue

            # Store the results
            results[(category, group)] = {
                'chi2': chi2,
                'p-value': p
            }

    return results

# Compare common groups between Baidu and Google
# (results are keyed by (category, group))
comparison_results = compare_common_groups(combined_baidu_common, google_common)
comparison_results


{('Political', 'stepfathers'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Swedes'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'punks'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'old people'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Europeans'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Indians'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'brothers'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Scots'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'celebrities'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'mothers'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Syrians'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'stepmothers'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'nerds'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'immigrants'): {'chi2': 0.03428571428571428,
  'p-value': 0.8531003889790089},
 ('Political', 'Moroccans'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Norwegians'): {'chi2': 0.0, 'p-value': 1.0},
 ('Political', 'Colo

In [21]:
import pandas as pd
import statsmodels.api as sm

# Paths to the datasets
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "Google": '../data/temp/modified_google_follow_query.csv'
}

# Loading the datasets into DataFrames
# NOTE(review): no pd.to_numeric coercion of the rating column here - confirm it is numeric.
dfs = {engine: pd.read_csv(path) for engine, path in file_paths.items()}

# Combining Baidu Template 1 and 2 and removing duplicates based on 'group'
# NOTE(review): drop_duplicates(subset='group') leaves one row per distinct 'group'
# value (the first one encountered) - confirm this is intended.
combined_baidu = pd.concat([dfs["Baidu Template 1"], dfs["Baidu Template 2"]]).drop_duplicates(subset='group')

# Loading the Google dataset
google = dfs["Google"]

# Identify common groups
# Labels present both in Baidu's 'English' column and in Google's 'group' column.
common_groups = set(combined_baidu['English']).intersection(set(google['group']))

# Filter each dataset for common groups
combined_baidu_common = combined_baidu[combined_baidu['English'].isin(common_groups)]
google_common = google[google['group'].isin(common_groups)]

# Function to perform Chi-square test for each category in each common group
def compare_categories_in_common_groups(baidu_df, google_df):
    results = {}
    for group in common_groups:
        baidu_group_df = baidu_df[baidu_df['group'] == group]
        google_group_df = google_df[google_df['group'] == group]

        categories = set(baidu_group_df['category']).union(set(google_group_df['category']))

        for category in categories:
            baidu_category_df = baidu_group_df[baidu_group_df['category'] == category]
            google_category_df = google_group_df[google_group_df['category'] == category]

            # Count negative sentiment (common_sentiment_rating == 3) and others
            count_negative_baidu = baidu_category_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_category_df['common_sentiment_rating'].count() - count_negative_baidu

            count_negative_google = google_category_df['common_sentiment_rating'].eq(3).sum()
            count_others_google = google_category_df['common_sentiment_rating'].count() - count_negative_google

            # Construct contingency table
            contingency_table = [[count_negative_baidu, count_others_baidu], 
                                 [count_negative_google, count_others_google]]

            # Perform Chi-square test
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
            chi2 = test_result.statistic
            p = test_result.pvalue

            # Store the results
            results[(group, category)] = {
                'chi2': chi2,
                'p-value': p
            }

    return results

# Compare categories within common groups between Baidu and Google
# (results are keyed by (group, category))
comparison_results = compare_categories_in_common_groups(combined_baidu_common, google_common)
comparison_results


{('stepfathers', 'Gender'): {'chi2': 0.06944444444444439,
  'p-value': 0.7921473917958973},
 ('Swedes', 'Nationalities'): {'chi2': 0.4937142857142857,
  'p-value': 0.48227510850450095},
 ('punks', 'Lifestyle'): {'chi2': 0.0888888888888889,
  'p-value': 0.7655944839957641},
 ('old people', 'Age'): {'chi2': 0.315, 'p-value': 0.574628070589583},
 ('Europeans', 'Peoples'): {'chi2': 0.0694444444444444,
  'p-value': 0.7921473917958972},
 ('Indians', 'Nationalities'): {'chi2': 0.0, 'p-value': 1.0},
 ('brothers', 'Gender'): {'chi2': 0.16761904761904758,
  'p-value': 0.682236562435615},
 ('Scots', 'Nationalities'): {'chi2': 0.0, 'p-value': 1.0},
 ('celebrities', 'Lifestyle'): {'chi2': 0.004183006535947711,
  'p-value': 0.9484318393195734},
 ('mothers', 'Gender'): {'chi2': 0.25249597423510467,
  'p-value': 0.6153230429550354},
 ('Syrians', 'Nationalities'): {'chi2': 0.5739795918367346,
  'p-value': 0.44868185197701504},
 ('stepmothers', 'Gender'): {'chi2': 0.315, 'p-value': 0.574628070589583},
 

In [23]:
import pandas as pd
import statsmodels.api as sm

# Paths to the datasets
file_paths = {
    "Baidu Template 1": '../data/temp/modified_baidu_follow_query_template_1.csv',
    "Baidu Template 2": '../data/temp/modified_baidu_follow_query_template_2.csv',
    "Google": '../data/temp/modified_google_follow_query.csv'
}

# Loading the datasets into DataFrames
# NOTE(review): no pd.to_numeric coercion of the rating column here - confirm it is numeric.
dfs = {engine: pd.read_csv(path) for engine, path in file_paths.items()}

# Combining Baidu Template 1 and 2 and removing duplicates based on 'group'
# NOTE(review): drop_duplicates(subset='group') leaves one row per distinct 'group'
# value (the first one encountered) - confirm this is intended.
combined_baidu = pd.concat([dfs["Baidu Template 1"], dfs["Baidu Template 2"]]).drop_duplicates(subset='group')

# Loading the Google dataset
google = dfs["Google"]

# Identify common groups
# Labels present both in Baidu's 'English' column and in Google's 'group' column.
common_groups = set(combined_baidu['English']).intersection(set(google['group']))

# Filter each dataset for common groups
combined_baidu_common = combined_baidu[combined_baidu['English'].isin(common_groups)]
google_common = google[google['group'].isin(common_groups)]

# Function to perform Chi-square test for each category in each common group
def compare_categories_in_common_groups(baidu_df, google_df):
    results = {}
    for group in common_groups:
        baidu_group_df = baidu_df[baidu_df['English'] == group]
        google_group_df = google_df[google_df['group'] == group]

        categories = set(baidu_group_df['category']).union(set(google_group_df['category']))

        for category in categories:
            baidu_category_df = baidu_group_df[baidu_group_df['category'] == category]
            google_category_df = google_group_df[google_group_df['category'] == category]

            # Count negative sentiment (common_sentiment_rating == 3) and others
            count_negative_baidu = baidu_category_df['common_sentiment_rating'].eq(3).sum()
            count_others_baidu = baidu_category_df['common_sentiment_rating'].count() - count_negative_baidu

            count_negative_google = google_category_df['common_sentiment_rating'].eq(3).sum()
            count_others_google = google_category_df['common_sentiment_rating'].count() - count_negative_google

            # Construct contingency table
            contingency_table = [[count_negative_baidu, count_others_baidu], 
                                 [count_negative_google, count_others_google]]

            # Perform Chi-square test
            test_result = sm.stats.Table2x2(contingency_table).test_nominal_association()
            chi2 = test_result.statistic
            p = test_result.pvalue

            # Store the results
            results[(group, category)] = {
                'chi2': chi2,
                'p-value': p
            }

    return results

# Compare categories within common groups between Baidu and Google
# (results are keyed by (group, category))
comparison_results = compare_categories_in_common_groups(combined_baidu_common, google_common)
comparison_results


{('stepfathers', 'Gender'): {'chi2': 0.0, 'p-value': 1.0},
 ('Swedes', 'Nationalities'): {'chi2': 1.6170000000000002,
  'p-value': 0.20351060264386134},
 ('punks', 'Lifestyle'): {'chi2': 0.44999999999999996,
  'p-value': 0.502334954360502},
 ('old people', 'Age'): {'chi2': 0.8711111111111111,
  'p-value': 0.3506478897044589},
 ('Europeans', 'Peoples'): {'chi2': 0.0, 'p-value': 1.0},
 ('Indians', 'Nationalities'): {'chi2': 0.09722222222222228,
  'p-value': 0.7551888894226784},
 ('brothers', 'Gender'): {'chi2': 0.006845238095238094,
  'p-value': 0.9340614956547423},
 ('Scots', 'Nationalities'): {'chi2': 0.13392857142857145,
  'p-value': 0.7143930376343262},
 ('celebrities', 'Lifestyle'): {'chi2': 0.09777777777777788,
  'p-value': 0.7545128626937243},
 ('mothers', 'Gender'): {'chi2': 1.0330434782608695,
  'p-value': 0.3094449125794545},
 ('Syrians', 'Nationalities'): {'chi2': 1.3718820861678005,
  'p-value': 0.24148877833190419},
 ('stepmothers', 'Gender'): {'chi2': 0.0888888888888889,
  