-
Notifications
You must be signed in to change notification settings - Fork 0
/
__init__.py
415 lines (361 loc) · 19 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
from functools import reduce
from itertools import permutations
from typing import Union
import pandas as pd
from a_pandas_ex_plode_tool import all_nans_in_df_to_pdNA
from a_pandas_ex_df_to_string import ds_to_string
from pandas.core.base import PandasObject
def qq_s_value_counts_to_column(df: pd.Series) -> pd.Series:
    """
    Return, for every element of a Series, how often its value occurs in that Series.

    The result has one entry per input element (same order as the input), a
    fresh 0..n-1 index and the name 0, so it can be attached to a DataFrame as
    a "count" column or used when comparing DataFrames.
    Values that Series.value_counts() does not count (missing values, with its
    default settings) have no count and therefore show up as NaN.

    Example
        df = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
        df.Sex.ds_value_counts_to_column()  # available after calling pd_add_set()
        # one count per passenger: every "male" row carries the number of
        # "male" rows, every "female" row the number of "female" rows

    Parameters
        df: pd.Series
            The Series whose values are counted (it is not modified)
    Returns
        pd.Series
    """
    series_ = df.copy()

    def _counts_per_element(values: pd.Series) -> pd.Series:
        # Going through a dict drops the name/index metadata of value_counts(),
        # leaving a plain value -> count lookup table.
        lookup = pd.Series(values.value_counts().to_dict())
        # reset_index(drop=True) instead of to_frame().reset_index()[0]:
        # same result, but it also works when the input Series is named 0.
        return lookup.reindex(values).reset_index(drop=True).rename(0)

    try:
        return _counts_per_element(series_)
    except Exception:
        # Best-effort fallback (e.g. unhashable values such as lists cannot be
        # counted): count the string form of the values instead.
        # Use the qq_ds_to_string method when some other package registered it,
        # otherwise the ds_to_string function this module imports.
        # NOTE(review): assumes ds_to_string accepts a Series - confirm.
        to_string = getattr(series_, "qq_ds_to_string", None)
        series_ = to_string() if callable(to_string) else ds_to_string(series_)
        return _counts_per_element(series_)
def filter_same_dfs_columns(*args) -> list:
    """
    Reduce DataFrames/Series to the columns that all of them have in common.

    Series are converted to one-column DataFrames first. Every returned frame
    is a copy and has the shared columns in the order in which they appear in
    the first input, so the result is the same on every run.

    Parameters
        args: Union[pd.Series, pd.DataFrame]
            DataFrames or Series, how many you want
    Returns
        list
            One DataFrame per input (same order); an empty list if no input was passed
    """
    if not args:
        return []
    frames = [x.to_frame() if isinstance(x, pd.Series) else x for x in args]
    shared = set(frames[0].columns).intersection(
        *(frame.columns for frame in frames[1:])
    )
    # dict.fromkeys keeps the first frame's order and lists a repeated label once.
    ordered = [col for col in dict.fromkeys(frames[0].columns) if col in shared]
    return [frame[ordered].copy() for frame in frames]
def set_intersections_df(
    *args, accept_df_with_different_columns: bool = True
) -> pd.DataFrame:
    """
    Computes the intersection of n DataFrames/Series

    The comparison itself is done by set_check_dfs with set.intersection.
    The matching rows are collected from every input, one input after the
    other, so a row that is shared by n inputs is listed n times.
    The result is restricted to the columns of the first input.

    Example
        df = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
        #Let's create some DataFrames with random data from df
        df1 = df.sample(len(df) - len(df)//2).copy()
        df2 = df.sample(len(df) - len(df)//2).copy()
        df3 = df.sample(len(df) - len(df)//2).copy()
        # the method is available after calling pd_add_set()
        df1.ds_set_intersections(df2)  #Comparing 2 DataFrames
        df1.ds_set_intersections(df2, df3)  #Comparing 3 DataFrames

    Parameters
        args: Union[pd.Series, pd.DataFrame]
            DataFrames or Series, how many you want
        accept_df_with_different_columns: bool=True
            If True, the inputs are first passed through filter_same_dfs_columns,
            so only the columns that are in all dataframes will be compared.
            If False, the inputs are handed to set_check_dfs unchanged.
    Returns
        pd.DataFrame
    """
    frames = (
        filter_same_dfs_columns(*args) if accept_df_with_different_columns else args
    )
    # Remember the columns before comparing: set_check_dfs adds helper columns.
    wanted_columns = frames[0].columns
    matches = set_check_dfs(*frames, setfunction=set.intersection)
    return matches.filter(wanted_columns)
def set_symmetric_difference_df(
    *args, accept_df_with_different_columns: bool = True
) -> pd.DataFrame:
    """
    Computes the symmetric difference of n DataFrames/Series

    A row is part of the result if its content occurs in exactly one of the
    inputs. Rows are compared on the text form of ALL their columns, taken
    from ds_to_string(all_nans_in_df_to_pdNA(...)) of each input
    (NOTE(review): presumably a string conversion with unified missing
    values - confirm against those packages).
    The selected rows are taken from the original inputs, the index of each
    input's part is reset on its own, and the parts are concatenated, so index
    labels start again at 0 for every input.

    Example
        df = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
        #Let's create some DataFrames with random data from df
        df1 = df.sample(len(df) - len(df)//2).copy()
        df2 = df.sample(len(df) - len(df)//2).copy()
        df3 = df.sample(len(df) - len(df)//2).copy()
        # the method is available after calling pd_add_set()
        df1.ds_set_symmetric_difference(df2)  #Comparing 2 DataFrames
        df1.ds_set_symmetric_difference(df2, df3)  #Comparing 3 DataFrames

    Parameters
        args: Union[pd.Series, pd.DataFrame]
            DataFrames or Series, how many you want
        accept_df_with_different_columns: bool=True
            Let's say you have one DataFrame whose columns are: [Parch, PassengerId, Fare, Survived, SibSp,Embarked, Sex, Cabin]
            If you want to compare it to: [Flight, Fare, Survived, SibSp,Embarked, Sex, Cabin]
            It won't work, unless you pass accept_df_with_different_columns=True
            Only the columns that are in all dataframes will be compared
    Returns
        pd.DataFrame
    """
    if accept_df_with_different_columns:
        args = filter_same_dfs_columns(*args)
    originalcolumns = args[0].columns

    # One Series of row keys per input, aligned with that input's index.
    # The key is the text of the complete row; leaving out the first column
    # (as an earlier version did) makes rows that differ only there look equal.
    keys_per_frame = []
    for frame in args:
        as_text = ds_to_string(all_nans_in_df_to_pdNA(frame))
        row_keys = [str(row) for row in as_text.to_numpy().tolist()]
        keys_per_frame.append(pd.Series(row_keys, index=as_text.index, dtype=object))

    # A key belongs to the result if exactly one input contains it. This does
    # not depend on the order of the inputs, so one pass is enough.
    seen = set()
    repeated = set()
    for row_keys in keys_per_frame:
        distinct = set(row_keys)
        repeated |= seen & distinct
        seen |= distinct
    only_in_one = list(seen - repeated)

    pieces = []
    for frame, row_keys in zip(args, keys_per_frame):
        labels = row_keys.index[row_keys.isin(only_in_one).to_numpy()]
        # Select from the original input so the caller gets the untouched values.
        pieces.append(frame.loc[labels].reset_index(drop=True))
    return pd.concat(pieces).filter(originalcolumns)
def set_union_df(*args, accept_df_with_different_columns: bool = True) -> pd.DataFrame:
    """
    Computes the union of n DataFrames/Series

    The comparison itself is done by set_check_dfs with set.union; the result
    is restricted to the columns of the first input.

    If, for whatever reason, you don't want to use pd.concat(), you can use this method.
    Don't use this method if you can use pd.concat

    Example
        df = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
        #Let's create some DataFrames with random data from df
        df1 = df.sample(len(df) - len(df)//2).copy()
        df2 = df.sample(len(df) - len(df)//2).copy()
        # the method is available after calling pd_add_set()
        # 'Name' is the only column both inputs have, so the result has only 'Name'
        df1[['PassengerId','Survived','Name']].ds_set_union(df2[['Pclass','Cabin','Name']])

    Parameters
        args: Union[pd.Series, pd.DataFrame]
            DataFrames or Series, how many you want
        accept_df_with_different_columns: bool=True
            Let's say you have one DataFrame whose columns are: [Parch, PassengerId, Fare, Survived, SibSp,Embarked, Sex, Cabin]
            If you want to compare it to: [Flight, Fare, Survived, SibSp,Embarked, Sex, Cabin]
            It won't work, unless you pass accept_df_with_different_columns=True
            Only the columns that are in all dataframes will be compared
    Returns
        pd.DataFrame
    """
    frames = (
        filter_same_dfs_columns(*args) if accept_df_with_different_columns else args
    )
    # Remember the columns before comparing: set_check_dfs adds helper columns.
    wanted_columns = frames[0].columns
    combined = set_check_dfs(*frames, setfunction=set.union)
    return combined.filter(wanted_columns)
def set_check_dfs(*args, setfunction) -> pd.DataFrame:
    """
    Select the rows of several DataFrames that satisfy a set operation.

    Every input is converted with ds_to_string(all_nans_in_df_to_pdNA(...))
    and then compared in two steps:
    1. column by column (columns of the first input): setfunction is folded
       over the value sets of all inputs and each input keeps only the rows
       whose value is in the outcome,
    2. row by row: the text of the complete remaining row is used as key,
       setfunction is folded over the key sets and each input keeps only the
       rows whose key is in the outcome.
    The surviving rows are then taken from the original (unconverted) inputs.

    Parameters
        args: pd.DataFrame
            The DataFrames to compare; every column of the first one must
            exist in all the others
        setfunction:
            Function taking two sets and returning a set, e.g. set.intersection
            or set.union; it is applied with functools.reduce
    Returns
        pd.DataFrame
            The selected rows of all inputs, concatenated with a new 0..n-1
            index, plus two extra columns: "aa_original_index" (index label
            the row had in its input) and "aa_dfposition" (position of that
            input in args)
    """
    for_settemp = "for_set____________________"
    togethercols = args[0].columns.to_list()
    converted = [
        ds_to_string(all_nans_in_df_to_pdNA(frame)).copy() for frame in args
    ]

    # Step 1: narrow every frame down, one column at a time.
    for col in togethercols:
        allowed_values = list(
            reduce(setfunction, [set(frame[col].tolist()) for frame in converted])
        )
        converted = [
            frame.loc[frame[col].isin(allowed_values)].copy() for frame in converted
        ]

    # Step 2: compare whole rows. The key covers every column; skipping the
    # first one would accept rows whose first value merely occurs somewhere in
    # each frame. Building it from the raw values also works for frames that
    # became empty in step 1.
    for frame in converted:
        frame[for_settemp] = [str(row) for row in frame.to_numpy().tolist()]
    allowed_rows = list(
        reduce(setfunction, [set(frame[for_settemp].to_list()) for frame in converted])
    )

    pieces = []
    for position, frame in enumerate(converted):
        labels = frame.loc[frame[for_settemp].isin(allowed_rows)].index
        # Go back to the original input so the caller gets the untouched values.
        piece = args[position].loc[labels].copy()
        piece["aa_original_index"] = piece.index.__array__().copy()
        piece["aa_dfposition"] = position
        pieces.append(piece)
    return pd.concat(pieces, ignore_index=True)
def series_to_dataframe(
    df: Union[pd.Series, pd.DataFrame]
) -> (Union[pd.Series, pd.DataFrame], bool):
    """
    Make sure the input is a DataFrame and report whether it was a Series.

    Parameters
        df: Union[pd.Series, pd.DataFrame]
            The object to convert; a copy is made, the input is not modified
    Returns
        tuple
            (copy of the input as DataFrame, True if the input was a Series else False)
            A converted Series becomes a single column labelled with the Series' name.
    """
    frame = df.copy()
    if not isinstance(frame, pd.Series):
        return frame, False

    label = frame.name
    frame = frame.to_frame()
    try:
        frame.columns = [label]
    except Exception:
        # If the label cannot be used as column header, use it as index
        # label and transpose instead.
        frame.index = [label]
        frame = frame.T
    return frame, True
def pd_add_set():
    """
    Attach the functions of this module as methods to pandas' PandasObject.

    After the call they can be used as
    ds_value_counts_to_column, ds_set_intersections,
    ds_set_symmetric_difference and ds_set_union.
    """
    methods = {
        "ds_value_counts_to_column": qq_s_value_counts_to_column,
        "ds_set_intersections": set_intersections_df,
        "ds_set_symmetric_difference": set_symmetric_difference_df,
        "ds_set_union": set_union_df,
    }
    for method_name, function in methods.items():
        setattr(PandasObject, method_name, function)