<a href="https://colab.research.google.com/github/jonathan80622/Columbia-E4579/blob/main/services/backend/src/recommendation_system/recommendation_flow/filtering/fall_2023/BetaFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
from statistics import mean
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DataCollector - Do Not Modify

In [None]:
from sqlalchemy.sql.schema import ScalarElementColumnDefault
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

class DataCollector:
    """Feature pipeline and linear filter for (user, content) candidates.

    The class loads engagement and content-metadata TSV files from
    ``sample_data/``, derives per-user and per-content features, and keeps the
    content ids whose weighted feature sum reaches ``threshold()``.

    Subclasses must implement ``artist_styles_one_hot``, ``sources_one_hot``,
    ``num_inference_steps_one_hot`` and ``threshold``. ``coefficients`` returns
    all-zero weights here and is meant to be overridden with trained values.
    """

    def artist_styles_one_hot(self):
        """Return ``(categories, coefficients)`` for the ``artist_style`` column."""
        raise NotImplementedError(
            "you need to implement this, needs to be two lists, one for string one for coefficient, coefficient list is one larger to account for 'other'"
        )

    def sources_one_hot(self):
        """Return ``(categories, coefficients)`` for the ``source`` column."""
        raise NotImplementedError(
            "you need to implement this, needs to be two lists, one for string one for coefficient, coefficient list is one larger to account for 'other'"
        )

    def num_inference_steps_one_hot(self):
        """Return ``(categories, coefficients)`` for the ``num_inference_steps`` column."""
        raise NotImplementedError(
            "you need to implement this, needs to be two lists, one for string one for coefficient, coefficient list is one larger to account for 'other'"
        )

    def one_hot_encoding_functions(self):
        """Pair each ``(categories, coefficients)`` tuple with its metadata column name."""
        return zip(
            [self.artist_styles_one_hot(), self.sources_one_hot(), self.num_inference_steps_one_hot()],
            ['artist_style', 'source', 'num_inference_steps']
        )

    def custom_aggregation(self, prefix, data):
        """Aggregate one group of engagement rows into like/dislike/time features.

        Args:
            prefix: Prefix for the generated feature names (``'user'`` or ``'content'``).
            data: Engagement rows with ``engagement_type`` and ``engagement_value`` columns.

        Returns:
            pd.Series with ``{prefix}_likes``, ``{prefix}_dislikes`` and
            ``{prefix}_engagement_time_avg`` (NaN when the group has no
            ``MillisecondsEngagedWith`` rows).
        """
        result = {
            f'{prefix}_likes': np.sum((data['engagement_type'] == 'Like') & (data['engagement_value'] == 1)),
            f'{prefix}_dislikes': np.sum((data['engagement_type'] == 'Like') & (data['engagement_value'] == -1)),
            f'{prefix}_engagement_time_avg': data[data['engagement_type'] == 'MillisecondsEngagedWith']['engagement_value'].mean(),
        }
        return pd.Series(result)

    def feature_generation_user(self):
        """Return one row of aggregated engagement features per ``user_id``."""
        return self.user_data.groupby('user_id').apply(lambda data: self.custom_aggregation('user', data)).reset_index()

    @staticmethod
    def _make_one_hot_encoder(categories):
        """Build a dense ``OneHotEncoder`` over ``categories`` on old and new scikit-learn."""
        try:
            # Newer scikit-learn releases spell the dense-output flag ``sparse_output``.
            return OneHotEncoder(categories=[categories], sparse_output=False)
        except TypeError:
            # Older releases only accept the original ``sparse`` keyword.
            return OneHotEncoder(categories=[categories], sparse=False)

    def feature_generation_content_one_hot_encoding(self):
        """Add one-hot columns for every categorical metadata column, in place.

        Values outside the configured categories are first replaced by
        ``'other'``. The metadata frame is mutated and also returned.
        """
        for (categories, _coefficient), col_name in self.one_hot_encoding_functions():
            self.generated_content_metadata_data[col_name] = self.generated_content_metadata_data[col_name].apply(lambda x: x if x in categories else 'other')
            encoder = self._make_one_hot_encoder(categories + ['other'])
            encoded_data = encoder.fit_transform(self.generated_content_metadata_data[[col_name]])
            # Reuse the metadata index: the frame can be a filtered slice whose
            # labels are not 0..n-1, and column assignment aligns on index
            # labels, so a fresh RangeIndex would turn the new columns into NaN.
            encoded_df = pd.DataFrame(
                encoded_data,
                columns=encoder.get_feature_names_out([col_name]),
                index=self.generated_content_metadata_data.index,
            )
            for col in encoded_df.columns:
                self.generated_content_metadata_data[col] = encoded_df[col]
        return self.generated_content_metadata_data

    def feature_generation_content_engagement_value(self):
        """Return one row of aggregated engagement features per ``content_id``."""
        return self.engagement_data.groupby('content_id').apply(
            lambda data: self.custom_aggregation('content', data)
        ).reset_index()

    def feature_generation(self):
        """Run the three feature generators; their return values are discarded."""
        self.feature_generation_user()
        self.feature_generation_content_one_hot_encoding()
        self.feature_generation_content_engagement_value()

    def get_engagement_data(self, content_ids):
        """Read the engagement TSV and keep the rows whose content is in ``content_ids``."""
        df = pd.read_csv('sample_data/engagement.csv', sep="\t")
        return df[df['content_id'].isin(content_ids)]

    def get_generated_content_metadata_data(self, content_ids):
        """Read the metadata TSV and keep the rows whose content is in ``content_ids``."""
        df = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
        return df[df['content_id'].isin(content_ids)]

    def get_user_data(self, user_id):
        """Read the engagement TSV and keep the rows belonging to ``user_id``."""
        df = pd.read_csv('sample_data/engagement.csv', sep="\t")
        return df[df['user_id'] == user_id]

    def gather_data(self, user_id, content_ids):
        """Load the inference inputs for one user and a set of candidate content ids."""
        self.engagement_data = self.get_engagement_data(content_ids)
        self.generated_content_metadata_data = self.get_generated_content_metadata_data(content_ids)
        self.user_data = self.get_user_data(user_id)

    def gather_training_data(self):
        """Load the complete engagement and metadata files for training."""
        self.engagement_data = pd.read_csv('sample_data/engagement.csv', sep="\t")
        self.generated_content_metadata_data = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
        self.user_data = pd.read_csv('sample_data/engagement.csv', sep="\t")

    def feature_eng_training(self):
        """Build the training table: one row per distinct (user_id, content_id) pair.

        User features and content features are left-joined onto the
        interaction pairs; missing values are filled with 0 after each join.
        The table is stored in ``self.training_results`` and returned.
        """
        user_attr = self.feature_generation_user()
        content_engagement_features = self.feature_generation_content_engagement_value()
        generated_content_features = self.feature_generation_content_one_hot_encoding()

        interaction_pairs = self.engagement_data[
            ['user_id', 'content_id']].drop_duplicates()

        self.training_results = pd.merge(
            interaction_pairs,
            user_attr,
            on='user_id',
            how='left'
        ).fillna(0)

        content_results = pd.merge(
            generated_content_features,
            content_engagement_features,
            on='content_id',
            how='left'
        ).fillna(0)

        self.training_results = pd.merge(
            self.training_results,
            content_results,
            on='content_id',
            how='left'
        ).fillna(0)

        return self.training_results

    def feature_eng(self):
        """Build the inference table in ``self.results``: one row per candidate content.

        Content features are joined first, then every row is tagged with the
        first ``user_id`` found in the user features and joined to them.
        """
        user_attr = self.feature_generation_user()
        content_engagement_features = self.feature_generation_content_engagement_value()
        generated_content_features = self.feature_generation_content_one_hot_encoding()
        self.results = pd.merge(
            generated_content_features,
            content_engagement_features,
            on='content_id',
            how='left'
        ).fillna(0)
        self.results['user_id'] = user_attr['user_id'].iloc[0]
        self.results = pd.merge(
            self.results,
            user_attr,
            on='user_id'
        )

    def threshold(self):
        """Return the minimum ``linear_output`` a content needs to pass the filter."""
        raise NotImplementedError("you need to implement")

    def coefficients(self):
        """Return the weight of every numeric (non one-hot) feature column."""
        return {
            'content_likes': 0.0,
            'content_dislikes': 0.0,
            'content_engagement_time_avg': 0.0,

            'user_likes': 0.0,
            'user_dislikes': 0.0,
            'user_engagement_time_avg': 0.0,
        }

    def get_columns(self):
        """Return all model feature names: numeric features, then the one-hot columns."""
        cols = list(self.coefficients().keys())
        for (categories, _coefficients), col_name in self.one_hot_encoding_functions():
            for category, coefficient in zip(categories + ['other'], _coefficients):
                cols.append(col_name + "_" + str(category))
        return cols

    def run_linear_model(self):
        """Score ``self.results`` and return the content ids at or above the threshold.

        Adds a ``linear_output`` column (weighted sum of every feature) to
        ``self.results`` and returns the passing ``content_id`` values as an array.
        """
        coeffs = self.coefficients()
        for (categories, _coefficients), col_name in self.one_hot_encoding_functions():
            for category, coefficient in zip(categories + ['other'], _coefficients):
                coeffs[col_name + "_" + str(category)] = coefficient

        self.results['linear_output'] = 0.0
        for col_name, _coefficient in coeffs.items():
            self.results['linear_output'] += self.results[col_name] * _coefficient
        return self.results[self.results['linear_output'] >= self.threshold()]['content_id'].values

    def filter_content_ids(self, user_id, content_ids):
        """Load data, build features and return the candidate ids that pass the filter."""
        self.gather_data(user_id, content_ids)
        self.feature_eng()
        return self.run_linear_model()

# Your Implementation - Example Here, Must Modify

In [None]:
class DataCollectorImplExample(DataCollector):
    """Example filter that adds per-artist-style and per-source user like counts.

    ``feature_generation_user``, ``feature_generation_content_engagement_value``
    and ``one_hot_encoding_functions`` are inherited from ``DataCollector``;
    the overrides that used to live here were identical copies of the base
    implementations.
    """

    def __init__(self):
        # Categories that get their own one-hot column and per-user like/dislike counters.
        self.artist_styles_categories = ['van_gogh', 'jean-michel_basquiat', 'detailed_portrait', 'kerry_james_marshall', 'medieval']
        self.sources_categories = ['human_prompts', 'r/Showerthoughts', 'r/EarthPorn', 'r/scifi', 'r/pics']

    def artist_styles_one_hot(self):
        """Return the artist-style categories and their weights (last weight is 'other')."""
        return self.artist_styles_categories, [
            0.08419699216085753, -0.07851817041961259, 0.056928033643688675, -0.028233440372052384, 0.09795210363137236, -0.02846836075845514
        ]

    def sources_one_hot(self):
        """Return the source categories and their weights (last weight is 'other')."""
        return self.sources_categories, [
            0.08969932181089971, -0.010294043640662762, 0.09210472353549924, 0.0183836782636583, -0.03587177847683946, -0.05016474360675957
        ]

    def num_inference_steps_one_hot(self):
        """Return the inference-step categories and their weights (last weight is 'other')."""
        return [
            100
        ], [
            0.07907408574691606, 0.024783072138878422
        ]

    def get_user_data(self, user_id):
        """Return the user's engagement rows joined with the metadata of each content.

        The join uses the complete metadata file rather than
        ``self.generated_content_metadata_data``: at inference time that
        attribute only holds the candidate contents, which would make the
        per-style / per-source user counts depend on the candidate set and
        disagree with the features built by ``gather_training_data``.
        """
        engagement = pd.read_csv('sample_data/engagement.csv', sep="\t")
        user_engagement = engagement[engagement['user_id'] == user_id]
        metadata = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
        return user_engagement.merge(metadata, on=['content_id'], how='left')

    def gather_training_data(self):
        """Load the full files; user data is the engagement joined with content metadata."""
        self.engagement_data = pd.read_csv('sample_data/engagement.csv', sep="\t")
        self.generated_content_metadata_data = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
        self.user_data = pd.read_csv('sample_data/engagement.csv', sep="\t").merge(self.generated_content_metadata_data, on=['content_id'], how='left')

    def custom_aggregation(self, prefix, data):
        """Aggregate one group of engagement rows into features.

        Produces the like, dislike and average-engagement-time features for
        every prefix. For ``prefix == 'user'`` it also adds a like and a
        dislike count for each configured artist style and source, which
        requires ``artist_style`` and ``source`` columns in ``data``.
        """
        # Compute the two masks once; every counter below is a refinement of them.
        is_like = (data['engagement_type'] == 'Like') & (data['engagement_value'] == 1)
        is_dislike = (data['engagement_type'] == 'Like') & (data['engagement_value'] == -1)

        result = {
            f'{prefix}_likes': np.sum(is_like),
            f'{prefix}_dislikes': np.sum(is_dislike),
            f'{prefix}_engagement_time_avg': data[data['engagement_type'] == 'MillisecondsEngagedWith']['engagement_value'].mean(),
        }
        if prefix == 'user':
            per_column_categories = (
                ('artist_style', self.artist_styles_categories),
                ('source', self.sources_categories),
            )
            for column, categories in per_column_categories:
                for category in categories:
                    in_category = data[column] == category
                    result[f'{prefix}_{category}_likes'] = np.sum(is_like & in_category)
                    result[f'{prefix}_{category}_dislikes'] = np.sum(is_dislike & in_category)
        return pd.Series(result)

    def coefficients(self):
        """Return the trained weight of every numeric (non one-hot) feature."""
        return {
            'content_likes': 0.0393806685305066,
            'content_dislikes': -0.1251121639248738,
            'content_engagement_time_avg': 1.7047921436141263e-11,

            'user_likes': 5.4764371246059006e-05,
            'user_dislikes': -0.00038144676253312266,
            'user_engagement_time_avg': 1.3979832806922074e-07,

            'user_van_gogh_likes': -0.0015617486488857127,
            'user_van_gogh_dislikes': 0.007679110213892672,
            'user_jean-michel_basquiat_likes': -0.0009773921103118932,
            'user_jean-michel_basquiat_dislikes': -0.0008799250096847402,
            'user_detailed_portrait_likes': -0.0017054374720878109,
            'user_detailed_portrait_dislikes': 0.0006448387863193699,
            'user_kerry_james_marshall_likes': 0.010037528570189526,
            'user_kerry_james_marshall_dislikes': 0.01263553743524961,
            'user_medieval_likes': -0.002036888897848471,
            'user_medieval_dislikes': 0.0067493455744942355,
            'user_human_prompts_likes': 0.00046769278872973313,
            'user_human_prompts_dislikes': -0.0007526283000997576,
            'user_r/Showerthoughts_likes': 0.0014862618080521658,
            'user_r/Showerthoughts_dislikes': -0.0035579735080894007,
            'user_r/EarthPorn_likes': -7.742487892404582e-05,
            'user_r/EarthPorn_dislikes': -0.0021741285998859673,
            'user_r/scifi_likes': 0.002639451581008141,
            'user_r/scifi_dislikes': 0.004024616643409577,
            'user_r/pics_likes': 0.0027597431193407715,
            'user_r/pics_dislikes': 0.006782971979724735,
        }

    def threshold(self):
        """Return the minimum linear score a content needs to pass the filter."""
        return 0.35

# Example For Use In Production

In [None]:
# Use every content id in the metadata file as the candidate pool.
data_collector = DataCollectorImplExample()
content_metadata = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
random_content_ids = content_metadata['content_id'].values
print(len(random_content_ids))

# Run the filter for user 17 and report how many candidates survive.
output = data_collector.filter_content_ids(17, random_content_ids)
print(len(output))

107705




7701


# Training

In [None]:
#@title get training data
# Fresh collector so no state from the production example above is reused.
data_collector = DataCollectorImplExample()
# Load the full engagement and metadata files ...
data_collector.gather_training_data()
# ... and build one feature row per distinct (user_id, content_id) pair.
training_data = data_collector.feature_eng_training()



In [None]:
#@title code to implement
def get_Y(engagement_data: pd.DataFrame, pairs: Optional[pd.DataFrame] = None) -> pd.Series:
    """Engineer the target variable: the summed 'Like' value of each (user, content) pair.

    Args:
      engagement_data (pd.DataFrame): Engagement rows with 'user_id',
        'content_id', 'engagement_type' and 'engagement_value' columns.
      pairs (pd.DataFrame, optional): Frame whose 'user_id' / 'content_id'
        columns define the rows (and row order) of the result. Defaults to
        the notebook-level ``training_data`` so existing calls keep working.

    Returns:
      pd.Series: The 'score' column, one value per row of ``pairs``. A pair
        scores the sum of its 'Like' engagement values, 0 when it has no
        'Like' row, and NaN when it does not occur in ``engagement_data``.
    """
    if pairs is None:
        pairs = training_data

    # Zero out every non-'Like' value so a plain grouped sum yields the score.
    is_like = engagement_data['engagement_type'] == 'Like'
    like_values = engagement_data['engagement_value'].where(is_like, 0)
    target_df = (
        like_values
        .groupby([engagement_data['user_id'], engagement_data['content_id']])
        .sum()
        .reset_index(name='score')
    )

    # DO NOT CHANGE THIS. This step ensures that each row of the target variable (y)
    # corresponds to the correct row of features (X).
    target_df = pd.merge(
          pairs[['user_id', 'content_id']],
          target_df,
          on=['user_id', 'content_id'],
          how='left'
      )

    return target_df['score']

# Raw engagement log used to derive the target.
engagement_data = pd.read_csv('sample_data/engagement.csv', sep="\t")
# Feature matrix: the model columns of the training table, in get_columns() order.
X = training_data[data_collector.get_columns()]
# Target: one score per row of training_data.
y = get_Y(engagement_data)

In [None]:
# Preview the first feature rows.
X.head()

Unnamed: 0,content_likes,content_dislikes,content_engagement_time_avg,user_likes,user_dislikes,user_engagement_time_avg,user_van_gogh_likes,user_van_gogh_dislikes,user_jean-michel_basquiat_likes,user_jean-michel_basquiat_dislikes,...,artist_style_medieval,artist_style_other,source_human_prompts,source_r/Showerthoughts,source_r/EarthPorn,source_r/scifi,source_r/pics,source_other,num_inference_steps_100,num_inference_steps_other
0,0.0,3.0,1950.0,11.0,2.0,10456.625,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.0,3411.0,11.0,2.0,10456.625,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,1.0,3453.0,11.0,2.0,10456.625,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,4.0,1.0,2528.555556,11.0,2.0,10456.625,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1898.5,11.0,2.0,10456.625,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Distribution of the target score across all (user, content) pairs.
y.value_counts()

 0    81384
 1    44632
-1    35475
 2       23
-2        6
-4        3
 3        3
-3        1
 5        1
Name: score, dtype: int64

In [None]:
# training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Split data into train and test: add/change other parameters as you wish.
# Also, feel free to use cross validation.
# A fixed random_state keeps the split, and every number reported below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Depending on what your target variable y looks like, you have to choose a suitable model.
# Here y is a signed like score (not a binary label), so ordinary least-squares
# Linear Regression is used.
model = LinearRegression()

parameters = {
    "positive": [True, False],
    "fit_intercept": [True, False],
}

grid_search = GridSearchCV(model, parameters, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)


{'fit_intercept': False, 'positive': False}


In [None]:
# In-sample error of the selected model.
y_pred = grid_search.predict(X_train)
train_squared_error = (y_pred - y_train) ** 2

print(f"MSE: {np.mean(train_squared_error)}")

MSE: 0.3809555863543207


In [None]:
# Held-out error; y_pred keeps the test predictions for the cells below.
y_pred = grid_search.predict(X_test)
test_squared_error = (y_pred - y_test) ** 2

print(f"MSE: {np.mean(test_squared_error)}")

MSE: 0.38183784768066237


In [None]:
# Number of held-out rows, for comparison with the count passing the threshold below.
len(X_test)

40382

In [None]:
# How many held-out predictions exceed the candidate threshold.
linear_reg_threshold = 0.55
int(np.count_nonzero(y_pred > linear_reg_threshold))

2694

# What You Need

In [None]:
# Print the fitted weights in a dict-like layout for copying into coefficients().
# The loop variables must not be called x / y: rebinding `y` here would silently
# replace the training target used by the earlier cells.
best_model = grid_search.best_estimator_
print("{")
for feature_name, coefficient in zip(best_model.feature_names_in_, best_model.coef_):
  print(f"\t{feature_name}: {coefficient},")
print("}")

# Policy Filtering 1

In [None]:
def policy_filter_one(training_data, content_id, target_size=100, max_share=0.2, verbose=True):
  """Pick about ``target_size`` contents while capping the share of any artist style.

  Each style's share of the contents is clipped to ``max_share``; the clipped
  shares are renormalised and converted to per-style quotas (rounded down, so
  the result can be slightly smaller than ``target_size``). Every quota is
  filled with the style's contents that have the highest summed ``user_likes``.

  Args:
    training_data: Frame with 'content_id', 'artist_style' and 'user_likes' columns.
    content_id: Unused; kept so existing calls keep working.
    target_size: Desired number of returned content ids.
    max_share: Upper bound on the share of a single artist style.
    verbose: When True, print the number of selected ids and the ids themselves.

  Returns:
    list: Selected content ids, grouped by style in descending style frequency.
  """
  per_content = training_data.groupby('content_id').agg(
      {'artist_style': lambda x: x.iloc[0], 'user_likes': 'sum'}
  )

  proportions = per_content.artist_style.value_counts(normalize=True).clip(upper=max_share)
  total_share = proportions.sum()

  filtered_content = []
  for style, share in proportions.items():
    quota = int(share * target_size / total_share)
    best_of_style = (
        per_content[per_content.artist_style == style]
        .sort_values(by='user_likes', ascending=False)
        .head(quota)
    )
    filtered_content.extend(best_of_style.index.tolist())

  if verbose:
    print(len(filtered_content))
    print(filtered_content)
  return filtered_content



# Smoke test on the full training table; the second argument is not used by the policy.
policy_filter_one(
    training_data,
    random_content_ids[0]
)

# Policy Filtering 2

In [None]:
def policy_filter_two(training_data, content_id, target_size=100, verbose=True):
  """Pick about ``target_size`` contents, mixing liked and not-yet-liked content.

  A content is "popular" when its summed ``content_likes`` is positive. With
  ``p`` / ``q`` the popular / unpopular fractions, the two groups receive
  quotas proportional to ``0.8 / p`` and ``0.2 / q`` (rounded down). If one
  group is empty the whole quota goes to the other group.

  Args:
    training_data: Frame with 'content_id', 'content_likes' and 'content_dislikes' columns.
    content_id: Unused; kept so existing calls keep working.
    target_size: Desired number of returned content ids.
    verbose: When True, print the popular/unpopular split and the selected ids.

  Returns:
    list: Selected content ids, popular ones first.
  """
  contents = training_data.groupby('content_id').agg({'content_likes': 'sum', 'content_dislikes': 'sum'})
  contents['popular'] = contents.content_likes > 0
  if verbose:
    print(contents['popular'].value_counts(normalize=True))

  n_total = len(contents)
  n_popular = int(contents['popular'].sum())
  p = n_popular / n_total if n_total else 0.0
  q = (n_total - n_popular) / n_total if n_total else 0.0

  if p == 0 or q == 0:
    # Only one group exists (or there is no content): avoid dividing by zero.
    popular_quota = target_size if q == 0 else 0
    unpopular_quota = target_size if p == 0 else 0
  else:
    a = 0.8 / p
    b = 0.2 / q
    popular_quota = int(a * target_size / (a + b))
    unpopular_quota = int(b * target_size / (a + b))

  # NOTE(review): the sort is ascending, so the least-liked popular contents
  # are taken first - confirm this ordering is intended.
  popular = contents[contents.popular].sort_values(by='content_likes')
  # The second slice must come from the unpopular group; taking it from the
  # popular group again returned duplicate ids and no unpopular content.
  unpopular = contents[~contents.popular].sort_values(by='content_likes')

  filtered_content = list(pd.concat([
      popular[:popular_quota],
      unpopular[:unpopular_quota],
  ]).index)

  if verbose:
    print(len(filtered_content))
    print(filtered_content)
  return filtered_content


# Smoke test on the full training table; the second argument is not used by the policy.
policy_filter_two(
    training_data,
    random_content_ids[0]
)

**Executive Summary**

This section summarizes the filtering tests run on the generated candidates.

1. Two-tower generated candidates:

1.1 Linear regression threshold = 0.35

Average filtered content id number = 51.5


1.2 policy 1 target = 100

Average filtered content id number = 97

1.3 policy 2 target = 100

Filtered content id number = 99 (for all users)

Two tower total = 247


2. Your Choice generated candidates:

2.1 Linear regression threshold = 0.35

Average filtered content id number = 94


2.2 policy 1 target = 100

Filtered content id number = 96

2.3 policy 2 target = 100

Filtered content id number = 99 (for all users)

Your Choice total = 289


//

Two Tower Generated Candidates data testing

In [None]:
# Two-tower candidates: one 'user_<id>' column per user, each holding candidate content ids.
candidates_by_users = pd.read_csv('sample_data/Candidates_by_Users.csv', index_col=0)
candidates_by_users.head()

Unnamed: 0,user_1,user_4,user_5,user_6,user_7,user_8,user_9,user_10,user_11,user_12,...,user_65,user_66,user_67,user_69,user_70,user_71,user_74,user_75,user_76,user_77
0,41206,29917,89959,28723,30102,32463,28838,29917,33907,28838,...,28838,33907,33907,33907,33907,70481,28954,29388,82650,28838
1,29954,32728,79978,30884,31252,29757,29917,28954,28838,31148,...,28801,33290,33290,33290,75259,43902,28723,51321,57627,32463
2,35675,33290,118750,31978,31832,31148,33290,28723,33290,32463,...,28954,29388,38357,29388,46104,91264,29272,34626,45572,33290
3,65899,31728,74592,29272,33138,29917,29688,31239,29265,29019,...,28861,29954,29388,29265,29688,37264,29160,33805,79495,31148
4,33287,29482,70481,28748,28801,32049,29265,32463,29388,33769,...,32375,38357,29917,29917,41967,90124,31239,30879,29451,33907


In [None]:
linear_reg_threshold = 0.35

# Rows of X are (user, content) interaction pairs, so a content id is not a row
# position in X. Look the features up by their real ids instead: content
# features are constant per content_id and user features per user_id.
feature_cols = list(X.columns)
user_cols = [col for col in feature_cols if col.startswith('user_')]
content_cols = [col for col in feature_cols if not col.startswith('user_')]
content_features = training_data.drop_duplicates('content_id').set_index('content_id')[content_cols]
user_features = training_data.drop_duplicates('user_id').set_index('user_id')[user_cols]

num = list()
# Iterate over the real columns instead of a hard-coded user count.
for user_col in candidates_by_users.columns:
  user_id = int(user_col.split('_')[-1])  # columns are named 'user_<id>'
  user_content_id = candidates_by_users[user_col]
  # Contents or users without any training row fall back to all-zero features.
  use_X = content_features.reindex(user_content_id.values).fillna(0)
  known_user = user_id in user_features.index
  for col in user_cols:
    use_X[col] = user_features.at[user_id, col] if known_user else 0.0
  use_X = use_X[feature_cols]  # same column order the model was fitted with
  user_pred = grid_search.predict(use_X)
  filtered_content_id = use_X[user_pred > linear_reg_threshold]
  num.append(len(filtered_content_id))

In [None]:
# Average number of candidates per user that pass the regression threshold.
mean(num)

51.43478260869565

In [None]:
num_policy1 = list()
# Iterate over the real columns instead of a hard-coded user count.
for user_col in candidates_by_users.columns:
  user_content_id = candidates_by_users[user_col]
  # Select the candidates by content_id value; the ids are not row positions
  # of training_data.
  user_training = training_data[training_data['content_id'].isin(user_content_id)]
  filtered_content_id_policy1 = policy_filter_one(
      user_training,
      random_content_ids[0]
  )
  num_policy1.append(len(filtered_content_id_policy1))

In [None]:
#mean(num_policy1)

97.05797101449275

In [None]:
num_policy2 = list()
# Iterate over the real columns instead of a hard-coded user count.
for user_col in candidates_by_users.columns:
  user_content_id = candidates_by_users[user_col]
  # Select the candidates by content_id value; the ids are not row positions
  # of training_data.
  user_training = training_data[training_data['content_id'].isin(user_content_id)]
  filtered_content_id_policy2 = policy_filter_two(
      user_training,
      random_content_ids[0]
  )
  num_policy2.append(len(filtered_content_id_policy2))


In [None]:
#for the linear regression filtering, the max, min and mean number of filtered contents are: threshold = 0.35
# Spread of the per-user counts kept by the regression filter.
num_max, num_min, num_avg = max(num), min(num), mean(num)
print(f"Max: {num_max}")
print(f"Min: {num_min}")
print(f"Average: {num_avg}")


Max: 100
Min: 28
Average: 51.43478260869565


In [None]:
#for policy 1 filtering, the max, min and mean number of filtered contents are: target num = 100
# Spread of the per-user counts kept by policy 1.
policy1_max, policy1_min, policy1_avg = max(num_policy1), min(num_policy1), mean(num_policy1)
print(f"Max: {policy1_max}")
print(f"Min: {policy1_min}")
print(f"Average: {policy1_avg}")

Max: 99
Min: 96
Average: 97.05797101449275


In [None]:
#for policy 2 filtering, the max, min and mean number of filtered contents are: target num = 100
# Spread of the per-user counts kept by policy 2.
policy2_max, policy2_min, policy2_avg = max(num_policy2), min(num_policy2), mean(num_policy2)
print(f"Max: {policy2_max}")
print(f"Min: {policy2_min}")
print(f"Average: {policy2_avg}")

Max: 99
Min: 99
Average: 99


Your Choice Generated Candidates data testing

In [None]:
# "Your Choice" generator output: a single 'content_id' column of candidates.
yourchoice_by_users = pd.read_csv('sample_data/YourChoiceGenerator.csv', index_col=0)
yourchoice_by_users.head()

Unnamed: 0,content_id
0,112718
1,81576
2,85954
3,123413
4,90800


In [None]:
# Count the "Your Choice" candidates whose predicted score exceeds the threshold.
linear_reg_threshold = 0.35
yourchoice_content_id = yourchoice_by_users['content_id']
# NOTE(review): .iloc treats the content ids as row positions of X, whose rows
# are (user, content) interaction pairs, so the rows scored here are not the
# candidates' own features. A correct lookup also needs a user id, which this
# candidate file does not provide - TODO confirm the intended user.
yourchoice_X = X.iloc[yourchoice_content_id]
yourchoice_pred = grid_search.predict(yourchoice_X)
# From here on the name holds the filtered feature rows, not a list of ids.
yourchoice_content_id = yourchoice_X[yourchoice_pred > linear_reg_threshold]
len(yourchoice_content_id)

94

In [None]:
yourchoice_content_id = yourchoice_by_users['content_id']
# Select the candidates by content_id value; .loc on training_data looks up
# row labels, which are not content ids.
yourchoice_training = training_data[training_data['content_id'].isin(yourchoice_content_id)]

# Each policy prints the size of its selection; a bare len(...) in the middle
# of a cell displays nothing, so only the final expression is kept.
yourchoice_content_id_policy1 = policy_filter_one(
    yourchoice_training,
    random_content_ids[0]
  )

yourchoice_content_id_policy2 = policy_filter_two(
    yourchoice_training,
    random_content_ids[0]
  )
len(yourchoice_content_id_policy2)

96
[136303, 41457, 118995, 43691, 77017, 38489, 77535, 83994, 85534, 63914, 103565, 90203, 96302, 36495, 96061, 91061, 121410, 92727, 61263, 129802, 135257, 46832, 134678, 133579, 132562, 132293, 45283, 130289, 56780, 86235, 106969, 42166, 100358, 71963, 125555, 97159, 127060, 99203, 101170, 105903, 77396, 109514, 111920, 83556, 46693, 90909, 58606, 76647, 84116, 98271, 94810, 105922, 112304, 112061, 110384, 106911, 113023, 29726, 29309, 102906, 84632, 33074, 129332, 82520, 59494, 124754, 124131, 122947, 110976, 64802, 55074, 125762, 32627, 49926, 110174, 90557, 113222, 78843, 123670, 130576, 124002, 112410, 113313, 116237, 112990, 116102, 122893, 118660, 122115, 112277, 113033, 119158, 118679, 118878, 121495, 114448]
True     0.693712
False    0.306288
Name: popular, dtype: float64
99
[28723, 84116, 84567, 84964, 85013, 89546, 90109, 84040, 90203, 135796, 94810, 95542, 96302, 98271, 98288, 92727, 98557, 83994, 82391, 65152, 65744, 66714, 66889, 66967, 70760, 82490, 71963, 73445, 74597

99