In [52]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, LongType, DateType, TimestampType
from pyspark.sql.functions import upper, pandas_udf, expr
import pyspark.pandas as ps

from datetime import datetime, date
import pandas as pd
import numpy as np
import os

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [53]:
spark = (
    SparkSession.builder
    .appName("PandasToSparkTest")
    .master("local[*]")  # dùng toàn bộ CPU
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.python.worker.reuse", "false")   # tránh lỗi worker treo
    .config("spark.network.timeout", "300s")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.ansi.enabled", "false")
    .getOrCreate()
)



In [54]:
s = ps.Series([1, 2, np.nan, 3, 4])
s.to_pandas()



0    1.0
1    2.0
2    NaN
3    3.0
4    4.0
dtype: float64

In [55]:
psdf = ps.DataFrame({
    'a': [1, 2, 3, 4, 5, 6],
    'b': [100, 200, 300, 400, 500, 600],
    'c': ["one", "two", "three", "four", "five", "six"],
}, index=[10, 20, 30, 40, 50, 60])
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [56]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [57]:
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
psdf = ps.from_pandas(pdf)

In [58]:
pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns)+ ['E'])
pdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.241827,-0.752449,0.166789,-0.133067,
2013-01-02,-0.650524,1.247453,0.397529,-0.629899,
2013-01-03,-0.542208,-0.819262,2.124829,-0.465471,
2013-01-04,-0.532825,0.441236,0.162853,1.822458,


In [59]:
pdf1.loc[dates[0]:dates[1], 'E'] = 1
pdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.241827,-0.752449,0.166789,-0.133067,1.0
2013-01-02,-0.650524,1.247453,0.397529,-0.629899,1.0
2013-01-03,-0.542208,-0.819262,2.124829,-0.465471,
2013-01-04,-0.532825,0.441236,0.162853,1.822458,


In [60]:
psdf1 = ps.from_pandas(pdf1)
psdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.241827,-0.752449,0.166789,-0.133067,1.0
2013-01-02,-0.650524,1.247453,0.397529,-0.629899,1.0
2013-01-03,-0.542208,-0.819262,2.124829,-0.465471,
2013-01-04,-0.532825,0.441236,0.162853,1.822458,


In [61]:
psdf1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.241827,-0.752449,0.166789,-0.133067,1.0
2013-01-02,-0.650524,1.247453,0.397529,-0.629899,1.0


In [62]:
psdf1.fillna(value=5, inplace=True)
psdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.241827,-0.752449,0.166789,-0.133067,1.0
2013-01-02,-0.650524,1.247453,0.397529,-0.629899,1.0
2013-01-03,-0.542208,-0.819262,2.124829,-0.465471,5.0
2013-01-04,-0.532825,0.441236,0.162853,1.822458,5.0


In [63]:
psdf = ps.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
psdf

Unnamed: 0,A,B,C,D
0,foo,one,0.642641,0.064395
1,bar,one,1.203357,-0.26374
2,foo,two,-0.400495,-0.400884
3,bar,three,0.334605,2.345493
4,foo,two,0.564882,1.398825
5,bar,two,-1.953852,0.88562
6,foo,one,0.574276,-0.675537
7,foo,three,0.913662,0.956085


In [66]:
psdf.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,1.216918,-0.611142
bar,one,1.203357,-0.26374
foo,two,0.164386,0.997941
bar,three,0.334605,2.345493
bar,two,-1.953852,0.88562
foo,three,0.913662,0.956085


In [67]:
pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))
pser

2000-01-01   -0.957181
2000-01-02    0.929172
2000-01-03   -1.253638
2000-01-04   -1.170757
2000-01-05    1.535386
                ...   
2002-09-22   -0.322765
2002-09-23    0.267307
2002-09-24    0.610478
2002-09-25   -1.328949
2002-09-26   -0.705914
Freq: D, Length: 1000, dtype: float64

In [70]:
psser = ps.Series(pser)
psser

2000-01-01   -0.957181
2000-01-02    0.929172
2000-01-03   -1.253638
2000-01-04   -1.170757
2000-01-05    1.535386
2000-01-06   -0.200088
2000-01-07   -0.999740
2000-01-08    0.464952
2000-01-09   -0.704866
2000-01-10    0.879050
2000-01-11   -0.015021
2000-01-12    1.403835
2000-01-13   -0.414426
2000-01-14   -0.016961
2000-01-15    1.081794
2000-01-16   -0.040915
2000-01-17   -1.114839
2000-01-18    0.690579
2000-01-19    1.202980
2000-01-20    2.428248
2000-01-21    0.690652
2000-01-22   -0.424570
2000-01-23    1.462976
2000-01-24   -0.650520
2000-01-25   -0.311001
2000-01-26    1.130899
2000-01-27   -0.087385
2000-01-28    1.545177
2000-01-29   -2.041102
2000-01-30    1.609186
2000-01-31    1.402629
2000-02-01    0.933743
2000-02-02    1.053557
2000-02-03   -0.941330
2000-02-04   -0.284077
2000-02-05    0.169511
2000-02-06    0.113105
2000-02-07    0.613419
2000-02-08   -0.351787
2000-02-09    0.447919
2000-02-10   -0.110921
2000-02-11    1.901792
2000-02-12   -0.011471
2000-02-13 

In [71]:
psser = psser.cummax()

In [72]:
psser.plot()

25/10/23 10:09:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/23 10:09:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/23 10:09:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/23 10:09:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/10/23 10:09:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
