In [365]:
import pandas as pd
import numpy as np
from pathlib import Path

PATH = Path("books_after_merge.csv")  
df = pd.read_csv(PATH)
df.shape

  df = pd.read_csv(PATH)


(25375, 66)

In [366]:
stats = pd.DataFrame({
    "column": df.columns,
    "non_null_cnt": [df[c].notna().sum() for c in df.columns],
    "null_cnt": [df[c].isna().sum() for c in df.columns],
    "null_ratio": [df[c].isna().mean() for c in df.columns],
    "nunique": [df[c].nunique(dropna=True) for c in df.columns],
    "dtype": [str(df[c].dtype) for c in df.columns],
})
stats["all_null"] = stats["null_cnt"] == len(df)
stats["almost_all_null"] = stats["null_ratio"] > 0.98
stats["is_constant"] = stats["nunique"] <= 1
stats["is_norm_like"] = stats["column"].str.contains(r'(?:^|_)norm(?:_|$)', case=False, regex=True)
stats["is_equal_flag"] = stats["column"].str.endswith("_equal_all")

def suffix_of(c):
    for suf in ("_litres","_litnet","_authoday"):
        if c.endswith(suf):
            return suf
    return ""
stats["suffix"] = stats["column"].apply(suffix_of)

stats.sort_values(["is_norm_like","is_equal_flag","all_null","almost_all_null","is_constant","null_ratio","column"],
                  ascending=[False,False,False,False,False,False,True]).head(20)

Unnamed: 0,column,non_null_cnt,null_cnt,null_ratio,nunique,dtype,all_null,almost_all_null,is_constant,is_norm_like,is_equal_flag,suffix
37,author_norm_y,2001,23374,0.921143,794,object,False,False,False,True,False,
38,title_norm_y,2001,23374,0.921143,1982,object,False,False,False,True,False,
54,title_norm,12038,13337,0.525596,8322,object,False,False,False,True,False,
53,author_norm,12040,13335,0.525517,1283,object,False,False,False,True,False,
25,author_norm_x,14062,11313,0.445833,3708,object,False,False,False,True,False,
26,title_norm_x,14063,11312,0.445793,11335,object,False,False,False,True,False,
65,age_equal_all,25375,0,0.0,1,bool,False,False,True,False,True,
61,categories_equal_all,25375,0,0.0,1,bool,False,False,True,False,True,
58,date_equal_all,25375,0,0.0,1,bool,False,False,True,False,True,
63,description_equal_all,25375,0,0.0,1,bool,False,False,True,False,True,


In [367]:

KEEP_CORE = [c for c in ["join_key","has_litres","has_litnet","has_authoday"] if c in df.columns]

KEEP_STRICT = [c for c in ["title","authors","price","date","rate","categories","description","age","views","likes","comments"]
               if c in df.columns]

KEEP_UNIQUE = [c for c in df.columns if c.endswith(("_litres","_litnet","_authoday"))
               and any(k in c for k in [
                   "isbn", "id_litnet", "url_litres",
                   "views_", "likes_", "comments_",
                   "rating_count", "reviews_count",
                   "release_date", "written_date",
                   "pages", "copyright_holder",
                   "formats", "cycle_", "exclusive_",
                   "symbols_count", "a4_sheets"
               ])]

DROP_TECH = stats.loc[stats["is_norm_like"] | stats["is_equal_flag"], "column"].tolist()
DROP_CONST = stats.loc[(stats["is_constant"]) & (~stats["column"].isin(KEEP_CORE)), "column"].tolist()
DROP_ALLNULL = stats.loc[stats["all_null"], "column"].tolist()

DROP = sorted(set(DROP_TECH + DROP_CONST + DROP_ALLNULL))

len(KEEP_CORE), len(KEEP_STRICT), len(KEEP_UNIQUE), len(DROP)

(4, 8, 18, 23)

In [368]:

cols_trim = [c for c in df.columns if c not in DROP]
df_trim = df[cols_trim].copy()

df_trim.shape, df_trim.columns


((25375, 43),
 Index(['join_key', 'has_litres', 'has_litnet', 'has_authoday', 'title',
        'authors', 'url_litres', 'title_litres', 'authors_litres',
        'rating_litres', 'rating_count_litres', 'reviews_count_litres',
        'price_litres', 'genres_litres', 'age_limit_litres',
        'release_date_litres', 'written_date_litres', 'pages_litres',
        'isbn_litres', 'copyright_holder_litres', 'formats_litres',
        'description_litres', 'id_litnet', 'title_litnet', 'author_litnet',
        'genre_litnet', 'price_litnet', 'year_litnet', 'description_litnet',
        'title_authoday', 'authors_authoday', 'categories_authoday',
        'date_authoday', 'symbols_count_authoday', 'a4_sheets_authoday',
        'views_authoday', 'likes_authoday', 'comments_authoday',
        'reviews_authoday', 'price_authoday', 'cycle_authoday',
        'exclusive_authoday', 'annotation_authoday'],
       dtype='object'))

In [369]:

split_cols = df_trim["join_key"].str.split(" \| ", n=1, expand=True)
split_cols.columns = ["author_from_key", "title_from_key"]

df_trim = pd.concat([df_trim, split_cols], axis=1)

if "authors" in df_trim.columns:
    df_trim["authors"] = df_trim["authors"].fillna(df_trim["author_from_key"])
else:
    df_trim["authors"] = df_trim["author_from_key"]

if "title" in df_trim.columns:
    df_trim["title"] = df_trim["title"].fillna(df_trim["title_from_key"])
else:
    df_trim["title"] = df_trim["title_from_key"]

df_trim = df_trim.drop(columns=["author_from_key", "title_from_key"])

df_trim[["join_key","authors","title"]].head(10)

  split_cols = df_trim["join_key"].str.split(" \| ", n=1, expand=True)


Unnamed: 0,join_key,authors,title
0,4itaka | –∞–±—Å–æ–ª—é—Ç–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ —Ä–∞–∑–≤—Ä–∞—â–∞–µ—Ç –∞–±—Å–æ–ª—é—Ç–Ω–æ,4itaka,–∞–±—Å–æ–ª—é—Ç–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ —Ä–∞–∑–≤—Ä–∞—â–∞–µ—Ç –∞–±—Å–æ–ª—é—Ç–Ω–æ
1,4k zeus | rwby –≤–æ –∏–º—è —Å–∏–ª—ã —é–Ω–æ—Å—Ç–∏ —è,4k zeus,rwby –≤–æ –∏–º—è —Å–∏–ª—ã —é–Ω–æ—Å—Ç–∏ —è
2,ach | –∫—Ä–æ—Å—Å,ach,–∫—Ä–æ—Å—Å
3,afkter | —ç–ª—å—Ñ–∏–Ω–≥–∞—Ä–¥,afkter,—ç–ª—å—Ñ–∏–Ω–≥–∞—Ä–¥
4,alchoz | –≥–∞—Å—Ç–∞—Ä–±–∞–π—Ç–µ—Ä,alchoz,–≥–∞—Å—Ç–∞—Ä–±–∞–π—Ç–µ—Ä
5,alchoz | –¥—Ä—É–≥–æ–π —à–∏–Ω–∏–≥–∞–º–∏,alchoz,–¥—Ä—É–≥–æ–π —à–∏–Ω–∏–≥–∞–º–∏
6,aleks hom | —Å–∏—Å—Ç–µ–º–∞ —Ä–∞–∑–≤–∏—Ç–∏—è –≥–∏–ª—å–¥–∏–∏ –≤ –º–∏—Ä–µ fa...,aleks hom,—Å–∏—Å—Ç–µ–º–∞ —Ä–∞–∑–≤–∏—Ç–∏—è –≥–∏–ª—å–¥–∏–∏ –≤ –º–∏—Ä–µ fairy tail
7,aleksey nik | –≤–æ—Ä–æ–∂–µ—è,aleksey nik,–≤–æ—Ä–æ–∂–µ—è
8,aleksey nik | –¥—Ä–∞–∫–æ–Ω—å—è –∫—Ä–æ–≤—å,aleksey nik,–¥—Ä–∞–∫–æ–Ω—å—è –∫—Ä–æ–≤—å
9,alena milagro | –æ–º—É—Ç,alena milagro,–æ–º—É—Ç


In [370]:
df_trim = df_trim.drop(columns='join_key')

In [371]:
df_trim.to_csv("books_after_trim.csv", index=False)

In [372]:

UNIFY_FIELDS = {
    "title": ["title_litres", "title_authoday", "title_litnet"],
    "authors": ["authors_litres", "authors_authoday", "author_litnet"],
    "genre": ["genres_litres", "categories_authoday", "genre_litnet"],
    "description": ["description_litres", "annotation_authoday", "description_litnet"],
    "age": ["age_limit_litres", "age_litnet"],
}

def coalesce(df, cols):
    cols = [c for c in cols if c in df.columns]
    if not cols:
        return np.nan
    return df[cols].bfill(axis=1).iloc[:, 0]  

for new_col, src_cols in UNIFY_FIELDS.items():
    df_trim[new_col] = coalesce(df_trim, src_cols)
    filled = df_trim[new_col].notna().sum()
    print(f"{new_col}: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ {len(src_cols)} –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = {filled}")

to_drop = [c for cols in UNIFY_FIELDS.values() for c in cols if c in df_trim.columns]
df_trim = df_trim.drop(columns=to_drop)

print("\nüßπ –£–¥–∞–ª–µ–Ω–æ —Å—Ç–∞—Ä—ã—Ö –∫–æ–ª–æ–Ω–æ–∫:", len(to_drop))
print(" –î–æ–±–∞–≤–ª–µ–Ω—ã –Ω–æ–≤—ã–µ:", list(UNIFY_FIELDS.keys()))
df_trim.head(5)

title: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ 3 –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = 25375
authors: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ 3 –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = 25374
genre: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ 3 –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = 25375
description: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ 3 –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = 25375
age: –æ–±—ä–µ–¥–∏–Ω–µ–Ω–æ –∏–∑ 2 –ø–æ–ª–µ–π, –Ω–µ–ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π = 14063

üßπ –£–¥–∞–ª–µ–Ω–æ —Å—Ç–∞—Ä—ã—Ö –∫–æ–ª–æ–Ω–æ–∫: 13
 –î–æ–±–∞–≤–ª–µ–Ω—ã –Ω–æ–≤—ã–µ: ['title', 'authors', 'genre', 'description', 'age']


Unnamed: 0,has_litres,has_litnet,has_authoday,title,authors,url_litres,rating_litres,rating_count_litres,reviews_count_litres,price_litres,...,views_authoday,likes_authoday,comments_authoday,reviews_authoday,price_authoday,cycle_authoday,exclusive_authoday,genre,description,age
0,False,False,True,–ê–±—Å–æ–ª—é—Ç–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ - —Ä–∞–∑–≤—Ä–∞—â–∞–µ—Ç –∞–±—Å–æ–ª—é—Ç–Ω–æ,4itaka,,,,,,...,36K,595,36,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,–°–∫–∞–π—Ä–∏–º,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –§—ç–Ω—Ç–µ–∑–∏, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã",–ü—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–µ –ø–æ—Ö–æ–∂–¥–µ–Ω–∏–π –ø–æ–ø–∞–¥–∞–Ω—Ü–∞ —Å—Ç—É–ø–∏–≤—à–µ–≥–æ –Ω–∞...,
1,False,False,True,"RWBY. –í–æ –ò–º—è –°–∏–ª—ã –Æ–Ω–æ—Å—Ç–∏, –Ø...",4k_ZEUS,,,,,,...,88K,1¬†314,388,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,"–í–æ –ò–º—è –°–∏–ª—ã –Æ–Ω–æ—Å—Ç–∏, –Ø...",–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –§—ç–Ω—Ç–µ–∑–∏, –ü—Ä–∏–∫–ª—é—á–µ–Ω–∏—è",–ü–æ–ø–∞–¥–∞–Ω–µ—Ü –≤ –º–∏—Ä RWBY —Å –≤–Ω–µ—à–Ω–æ—Å—Ç—å—é –ú–∞–π—Ç–æ –ì–∞—è –∏–∑...,
2,False,False,True,–ö—Ä–æ—Å—Å,Ach,,,,,,...,410K,1¬†306,453,1.0,120 ‚ÇΩ,–†–∏—Ç–º,–ù–µ—Ç,"–†–æ–º–∞–Ω, –ö–æ—Å–º–∏—á–µ—Å–∫–∞—è —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã –≤ –∫–æ—Å...","–ü—Ä–∏—Å—É—Ç—Å—Ç–≤—É—é—Ç –º–æ—Ç–∏–≤—ã EVE. –ì–ì –±—É–¥–µ—Ç: –ø–∞—Ö–∞—Ç—å, –ø–æ—Ç...",
3,False,False,True,–≠–ª—å—Ñ–∏–Ω–≥–∞—Ä–¥,Afkter,,,,,,...,393K,2¬†337,339,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,–ê—Ä–¥–∞,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã, –§—ç–Ω—Ç–µ–∑–∏",–≠—Ç–æ –∏—Å—Ç–æ—Ä–∏—è –æ –ø–µ—Ä–µ—Ä–æ–∂–¥–µ–Ω–∏–∏ –¥—É—à–∏ —á–µ–ª–æ–≤–µ–∫–∞ –≤ –Ω–æ–≤...,
4,False,False,True,–ì–∞—Å—Ç–∞—Ä–±–∞–π—Ç–µ—Ä,alchoz,,,,,,...,178K,910,125,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã –≤ –º–∞–≥–∏—á–µ—Å–∫...","–û—á–Ω—É—Ç—å—Å—è –≤ –Ω–æ–≤–æ–º —Ç–µ–ª–µ, –≤ –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ–π –ª–∞–±–æ—Ä–∞—Ç–æ—Ä...",


In [373]:
df_trim.columns

Index(['has_litres', 'has_litnet', 'has_authoday', 'title', 'authors',
       'url_litres', 'rating_litres', 'rating_count_litres',
       'reviews_count_litres', 'price_litres', 'release_date_litres',
       'written_date_litres', 'pages_litres', 'isbn_litres',
       'copyright_holder_litres', 'formats_litres', 'id_litnet',
       'price_litnet', 'year_litnet', 'date_authoday',
       'symbols_count_authoday', 'a4_sheets_authoday', 'views_authoday',
       'likes_authoday', 'comments_authoday', 'reviews_authoday',
       'price_authoday', 'cycle_authoday', 'exclusive_authoday', 'genre',
       'description', 'age'],
      dtype='object')

In [374]:
df_final = df_trim.copy()


DROP_PATTERNS = [
    "id_",
    "isbn",
    "url",
]

drop_extra = [c for c in df_final.columns if any(pat in c for pat in DROP_PATTERNS)]

print(f" –£–¥–∞–ª—è–µ–º —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ –ø–æ–ª—è ({len(drop_extra)} —à—Ç.):")
for c in drop_extra:
    print(" -", c)

# —É–¥–∞–ª—è–µ–º
df_final = df_final.drop(columns=drop_extra)

print(f"\n –û—Å—Ç–∞–ª–æ—Å—å {len(df_final.columns)} –∫–æ–ª–æ–Ω–æ–∫ –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏")
df_final.head(5)

 –£–¥–∞–ª—è–µ–º —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ –ø–æ–ª—è (3 —à—Ç.):
 - url_litres
 - isbn_litres
 - id_litnet

 –û—Å—Ç–∞–ª–æ—Å—å 29 –∫–æ–ª–æ–Ω–æ–∫ –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏


Unnamed: 0,has_litres,has_litnet,has_authoday,title,authors,rating_litres,rating_count_litres,reviews_count_litres,price_litres,release_date_litres,...,views_authoday,likes_authoday,comments_authoday,reviews_authoday,price_authoday,cycle_authoday,exclusive_authoday,genre,description,age
0,False,False,True,–ê–±—Å–æ–ª—é—Ç–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ - —Ä–∞–∑–≤—Ä–∞—â–∞–µ—Ç –∞–±—Å–æ–ª—é—Ç–Ω–æ,4itaka,,,,,,...,36K,595,36,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,–°–∫–∞–π—Ä–∏–º,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –§—ç–Ω—Ç–µ–∑–∏, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã",–ü—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–µ –ø–æ—Ö–æ–∂–¥–µ–Ω–∏–π –ø–æ–ø–∞–¥–∞–Ω—Ü–∞ —Å—Ç—É–ø–∏–≤—à–µ–≥–æ –Ω–∞...,
1,False,False,True,"RWBY. –í–æ –ò–º—è –°–∏–ª—ã –Æ–Ω–æ—Å—Ç–∏, –Ø...",4k_ZEUS,,,,,,...,88K,1¬†314,388,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,"–í–æ –ò–º—è –°–∏–ª—ã –Æ–Ω–æ—Å—Ç–∏, –Ø...",–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –§—ç–Ω—Ç–µ–∑–∏, –ü—Ä–∏–∫–ª—é—á–µ–Ω–∏—è",–ü–æ–ø–∞–¥–∞–Ω–µ—Ü –≤ –º–∏—Ä RWBY —Å –≤–Ω–µ—à–Ω–æ—Å—Ç—å—é –ú–∞–π—Ç–æ –ì–∞—è –∏–∑...,
2,False,False,True,–ö—Ä–æ—Å—Å,Ach,,,,,,...,410K,1¬†306,453,1.0,120 ‚ÇΩ,–†–∏—Ç–º,–ù–µ—Ç,"–†–æ–º–∞–Ω, –ö–æ—Å–º–∏—á–µ—Å–∫–∞—è —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã –≤ –∫–æ—Å...","–ü—Ä–∏—Å—É—Ç—Å—Ç–≤—É—é—Ç –º–æ—Ç–∏–≤—ã EVE. –ì–ì –±—É–¥–µ—Ç: –ø–∞—Ö–∞—Ç—å, –ø–æ—Ç...",
3,False,False,True,–≠–ª—å—Ñ–∏–Ω–≥–∞—Ä–¥,Afkter,,,,,,...,393K,2¬†337,339,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,–ê—Ä–¥–∞,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã, –§—ç–Ω—Ç–µ–∑–∏",–≠—Ç–æ –∏—Å—Ç–æ—Ä–∏—è –æ –ø–µ—Ä–µ—Ä–æ–∂–¥–µ–Ω–∏–∏ –¥—É—à–∏ —á–µ–ª–æ–≤–µ–∫–∞ –≤ –Ω–æ–≤...,
4,False,False,True,–ì–∞—Å—Ç–∞—Ä–±–∞–π—Ç–µ—Ä,alchoz,,,,,,...,178K,910,125,0.0,–ë–µ—Å–ø–ª–∞—Ç–Ω–æ,,–ù–µ—Ç,"–†–æ–º–∞–Ω, –§–∞–Ω—Ñ–∏–∫, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã, –ü–æ–ø–∞–¥–∞–Ω—Ü—ã –≤ –º–∞–≥–∏—á–µ—Å–∫...","–û—á–Ω—É—Ç—å—Å—è –≤ –Ω–æ–≤–æ–º —Ç–µ–ª–µ, –≤ –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ–π –ª–∞–±–æ—Ä–∞—Ç–æ—Ä...",


In [375]:
df_final.columns

Index(['has_litres', 'has_litnet', 'has_authoday', 'title', 'authors',
       'rating_litres', 'rating_count_litres', 'reviews_count_litres',
       'price_litres', 'release_date_litres', 'written_date_litres',
       'pages_litres', 'copyright_holder_litres', 'formats_litres',
       'price_litnet', 'year_litnet', 'date_authoday',
       'symbols_count_authoday', 'a4_sheets_authoday', 'views_authoday',
       'likes_authoday', 'comments_authoday', 'reviews_authoday',
       'price_authoday', 'cycle_authoday', 'exclusive_authoday', 'genre',
       'description', 'age'],
      dtype='object')

In [376]:
mask = df_final["price_litres"].notna() & df_final["price_authoday"].notna()
count_both = mask.sum()

count_both

np.int64(2603)

In [377]:
import re
import numpy as np

def normalize_price(x):
    """–û—á–∏—â–∞–µ—Ç —Ü–µ–Ω—É –æ—Ç —Å–∏–º–≤–æ–ª–æ–≤ –≤–∞–ª—é—Ç—ã, —Å–ª–æ–≤ '–±–µ—Å–ø–ª–∞—Ç–Ω–æ', –∏ –ø—Ä–∏–≤–æ–¥–∏—Ç –∫ float."""
    if pd.isna(x):
        return np.nan
    s = str(x).strip().lower()
    if any(word in s for word in ["–±–µ—Å–ø–ª–∞—Ç", "free"]):
        return 0.0
    s = re.sub(r"[^0-9,\.]", "", s).replace(",", ".")
    try:
        return float(s)
    except:
        return np.nan

numeric_cols = ["price_litres", "price_authoday", "price_litnet"]
for c in numeric_cols:
    if c in df_final.columns:
        df_final[c] = df_final[c].apply(normalize_price)

In [378]:
df_final[["price_litres","price_authoday","price_litnet",
                "rating_litres","rating_count_litres","reviews_count_litres",
                "views_authoday","likes_authoday","comments_authoday",
                "reviews_authoday","a4_sheets_authoday","symbols_count_authoday"]].sample(10)

Unnamed: 0,price_litres,price_authoday,price_litnet,rating_litres,rating_count_litres,reviews_count_litres,views_authoday,likes_authoday,comments_authoday,reviews_authoday,a4_sheets_authoday,symbols_count_authoday
10192,,150.0,,,,,58K,833,749.0,0.0,1097.0,438799.0
23728,199.0,,,5.0,1.0,0.0,,,,,,
19833,,127.0,,,,,94K,654,116.0,0.0,1144.0,457685.0
18122,490.0,,,5.0,1.0,0.0,,,,,,
23623,299.0,,,4.6,279.0,25.0,,,,,,
8374,,169.0,,,,,179K,1¬†245,386.0,0.0,1208.0,483185.0
4177,164.0,,,5.0,10.0,0.0,,,,,,
16769,249.0,,,3.5,110.0,24.0,,,,,,
5865,249.0,,,4.9,26.0,5.0,,,,,,
15076,,159.0,,,,,170K,1¬†312,420.0,0.0,1202.0,480776.0


In [379]:
df_final = df_final.drop_duplicates()

In [380]:
df_final.to_csv("books_after_trim.csv", index=False)

In [381]:

flags = df_final[["has_litres", "has_litnet", "has_authoday"]].astype(bool)


print("–ö–æ–ª-–≤–æ –∫–Ω–∏–≥ –ø–æ –∏—Å—Ç–æ—á–Ω–∏–∫–∞–º:")
print(flags.sum().to_frame("count"))

from itertools import combinations

pairs = list(combinations(["has_litres", "has_litnet", "has_authoday"], 2))

print("\n –ü–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è (–ø–æ–ø–∞—Ä–Ω–æ):")
for a, b in pairs:
    both = (flags[a] & flags[b]).sum()
    only_a = (flags[a] & ~flags[b]).sum()
    only_b = (~flags[a] & flags[b]).sum()
    print(f"{a} ‚à© {b}: {both:>5} (—Ç–æ–ª—å–∫–æ {a}: {only_a:>5}, —Ç–æ–ª—å–∫–æ {b}: {only_b:>5})")

all_three = (flags["has_litres"] & flags["has_litnet"] & flags["has_authoday"]).sum()
print(f"\n –í–æ –≤—Å–µ—Ö —Ç—Ä—ë—Ö –∏—Å—Ç–æ—á–Ω–∏–∫–∞—Ö: {all_three}")

import pandas as pd

summary = pd.DataFrame({
    "Source": ["LitRes","LitNet","Author.Today","LitRes ‚à© LitNet","LitRes ‚à© Author.Today","LitNet ‚à© Author.Today","All three"],
    "Count": [
        flags["has_litres"].sum(),
        flags["has_litnet"].sum(),
        flags["has_authoday"].sum(),
        (flags["has_litres"] & flags["has_litnet"]).sum(),
        (flags["has_litres"] & flags["has_authoday"]).sum(),
        (flags["has_litnet"] & flags["has_authoday"]).sum(),
        all_three
    ]
})

summary

–ö–æ–ª-–≤–æ –∫–Ω–∏–≥ –ø–æ –∏—Å—Ç–æ—á–Ω–∏–∫–∞–º:
              count
has_litres    14063
has_litnet     2001
has_authoday  12039

 –ü–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è (–ø–æ–ø–∞—Ä–Ω–æ):
has_litres ‚à© has_litnet:   122 (—Ç–æ–ª—å–∫–æ has_litres: 13941, —Ç–æ–ª—å–∫–æ has_litnet:  1879)
has_litres ‚à© has_authoday:  2603 (—Ç–æ–ª—å–∫–æ has_litres: 11460, —Ç–æ–ª—å–∫–æ has_authoday:  9436)
has_litnet ‚à© has_authoday:     8 (—Ç–æ–ª—å–∫–æ has_litnet:  1993, —Ç–æ–ª—å–∫–æ has_authoday: 12031)

 –í–æ –≤—Å–µ—Ö —Ç—Ä—ë—Ö –∏—Å—Ç–æ—á–Ω–∏–∫–∞—Ö: 4


Unnamed: 0,Source,Count
0,LitRes,14063
1,LitNet,2001
2,Author.Today,12039
3,LitRes ‚à© LitNet,122
4,LitRes ‚à© Author.Today,2603
5,LitNet ‚à© Author.Today,8
6,All three,4
