## Dataframe and Series

### Series

In [36]:
import numpy as np
import pandas as pd

# Series based on array
series_from_list = pd.Series(data=[1, 2, 3, 4, 5.3],
                             index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5'],
                             dtype='string',
                             name='pd_Series'
                             )

print(series_from_list)


row_1      1
row_2      2
row_3      3
row_4      4
row_5    5.3
Name: pd_Series, dtype: string


In [13]:
dict_array = {
    'row_1': 2,
    'row_2': 3,
    'row_3': 4,
    'row_4': 5,
    'row_5': 6
}

# Series based on dict
series_from_dict = pd.Series(dict_array,
                             index=['row_3', 'row_4', 'row_100'],
                             dtype=int,
                             name='pd_Series'
                             )
print(series_from_dict)


row_3      4.0
row_4      5.0
row_100    NaN
Name: pd_Series, dtype: float64


In [None]:
# Copy Series
np_array = np.arange(1, 21)
series_from_np_array = pd.Series(np_array,
                                 copy=False
                                 )

series_from_np_array.iloc[3] = 100

print("Original: ", np_array)

np_array_2 = np.arange(1, 16)
series_from_np_array_2 = pd.Series(np_array_2,
                                   copy=True)

series_from_np_array_2.iloc[3] = 200

print("Copy", np_array_2)



Original:  [  1   2   3 100   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20]
Copy [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [19]:
# Series based on other series
series_list = pd.Series(data=[1, 2, 3, 4, 5],
                        index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5'],
                        dtype='string',
                        name='pd_Series_1'
                        )

series_from_series = pd.Series(data=series_list,
                               index=['row_2', 'row_5', 'row_1'])

print(series_from_series)



row_2    2
row_5    5
row_1    1
Name: pd_Series_1, dtype: string


### DataFrame

In [25]:
dict_array = {
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
    'col3': [9, 10, 11, 12],
    'col4': [13, 14, 15, 16]
}
# DataFrame based on dict
dateframe_from_dict = pd.DataFrame(
    data=dict_array,
    index=['row_1', 'row_2', 'row_3', 'row_4'],
    columns=['col3', 'col4', 'col1'],
    dtype=float
)

print(dateframe_from_dict)

       col3  col4  col1
row_1   9.0  13.0   1.0
row_2  10.0  14.0   2.0
row_3  11.0  15.0   3.0
row_4  12.0  16.0   4.0


In [28]:
# DataFrame based on numpy array WITH COPY PARAM
np_array = [
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20]
]

# DateFrame based on numpy array
df_1 = pd.DataFrame(data=np_array,
                    copy=True)
df_1.iloc[:, 2] = 100

print(df_1)
print(np_array)

    0   1    2   3   4
0   1   2  100   4   5
1   6   7  100   9  10
2  11  12  100  14  15
3  16  17  100  19  20
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]


In [30]:
dict_array = {
    'row_1': 2,
    'row_2': 3,
    'row_3': 4,
    'row_4': 5,
    'row_5': 6
}

array = np.array([1, 2, 3, 4, 5])

series_1 = pd.Series(dict_array)
series_2 = pd.Series(array, index=['row_3', 'row_4', 'row_5', 'row_6', 'row_9'])

df = pd.DataFrame(data={'col1': series_1, 'col2': series_2},
                  index=['row_1', 'row_7'])

print(df)

       col1  col2
row_1   2.0   NaN
row_7   NaN   NaN


## Read and Write to Files

In [None]:
# With default sep ,
df_1 = pd.read_csv('content/avito_data.csv')
print(df_1)


        user_id            city               category_name    price  \
0  dbe73ad6e4b5       Волгоград      Детская одежда и обувь      NaN   
1  2e11806abe57     Нижняя Тура                  Велосипеды   3000.0   
2  0b850bbebb10          Бердск               Аудио и видео  15000.0   
3  5f1d5c3ce0da         Саратов             Бытовая техника   4500.0   
4  23e2d97bfc7f         Бузулук  Товары для детей и игрушки   4900.0   
5  c2a632af2602  Ростов-на-Дону      Ремонт и строительство    500.0   
6  b239811ad530        Оренбург                    Ноутбуки  20990.0   
7  d85fa02e6341     Калининград                    Телефоны    990.0   
8  ae6586719bec     Новосибирск       Товары для компьютера   1200.0   
9  30ad26d633ef       Полесской      Детская одежда и обувь    400.0   

  activation_date user_type  year  month  day  
0      2017-04-18   Private  2017      4   18  
1      2017-04-16   Private  2017      4   16  
2      2017-04-17   Private  2017      4   17  
3      2017-04-

In [35]:
# With sep ;
df_2 = pd.read_csv('content/avito_sep.csv',
                   sep=';')
print(df_2)

        user_id            city               category_name    price  \
0  dbe73ad6e4b5       Волгоград      Детская одежда и обувь      NaN   
1  2e11806abe57     Нижняя Тура                  Велосипеды   3000.0   
2  0b850bbebb10          Бердск               Аудио и видео  15000.0   
3   5f1d53ce0da         Саратов             Бытовая техника   4500.0   
4  23e2d97bfc7f         Бузулук  Товары для детей и игрушки   4900.0   
5  c2a632af2602  Ростов-на-Дону      Ремонт и строительство    500.0   
6  b239811ad530        Оренбург                    Ноутбуки  20990.0   
7  d85fa02e6341     Калининград                    Телефоны    990.0   
8  ae6586719bec     Новосибирск       Товары для компьютера   1200.0   
9  30ad26d633ef       Полевской      Детская одежда и обувь    400.0   

  activation_date user_type  year  month  day  
0      2017-04-18   Private  2017      4   18  
1      2017-04-16   Private  2017      4   16  
2      2017-04-17   Private  2017      4   17  
3      2017-04-

In [41]:
import pandas as pd

df_index_col = pd.read_csv('content/avito_sep.csv',
                           sep=';',
                           index_col=['user_type', 'city']
                           )

print(df_index_col)

                               user_id               category_name    price  \
user_type city                                                                
Private   Волгоград       dbe73ad6e4b5      Детская одежда и обувь      NaN   
          Нижняя Тура     2e11806abe57                  Велосипеды   3000.0   
          Бердск          0b850bbebb10               Аудио и видео  15000.0   
          Саратов          5f1d53ce0da             Бытовая техника   4500.0   
          Бузулук         23e2d97bfc7f  Товары для детей и игрушки   4900.0   
          Ростов-на-Дону  c2a632af2602      Ремонт и строительство    500.0   
Shop      Оренбург        b239811ad530                    Ноутбуки  20990.0   
          Калининград     d85fa02e6341                    Телефоны    990.0   
Company   Новосибирск     ae6586719bec       Товары для компьютера   1200.0   
Private   Полевской       30ad26d633ef      Детская одежда и обувь    400.0   

                         activation_date  year  mon

In [43]:
df_usecols = pd.read_csv('content/avito_sep.csv',
                         sep=';',
                         index_col=['city'],
                         usecols=['user_id', 'city', 'price', 'activation_date']
                         )

print(df_usecols)

                     user_id    price activation_date
city                                                 
Волгоград       dbe73ad6e4b5      NaN      2017-04-18
Нижняя Тура     2e11806abe57   3000.0      2017-04-16
Бердск          0b850bbebb10  15000.0      2017-04-17
Саратов          5f1d53ce0da   4500.0      2017-04-17
Бузулук         23e2d97bfc7f   4900.0      2017-04-15
Ростов-на-Дону  c2a632af2602    500.0      2017-04-12
Оренбург        b239811ad530  20990.0      2017-04-17
Калининград     d85fa02e6341    990.0      2017-04-18
Новосибирск     ae6586719bec   1200.0      2017-04-18
Полевской       30ad26d633ef    400.0      2017-04-12


In [46]:
# test argument Squeeze - return Series if only one column is parsed
df_squeeze = pd.read_csv('content/avito_sep.csv',
                         sep=';',
                         usecols=['city'],
                         )

df_squeeze_city = df_squeeze['city']
print(df_squeeze)

             city
0       Волгоград
1     Нижняя Тура
2          Бердск
3         Саратов
4         Бузулук
5  Ростов-на-Дону
6        Оренбург
7     Калининград
8     Новосибирск
9       Полевской


In [49]:
df = pd.read_csv('content/avito_sep.csv',
                 sep=';',
                 usecols=['user_id', 'city', 'price', 'activation_date'],
                 )

df.info()

df1 = pd.read_csv('content/avito_sep.csv',
                  sep=';',
                  usecols=['user_id', 'city', 'price', 'activation_date'],
                  dtype={'city': 'category', 'user_id': 'string'}
                  )

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          10 non-null     object 
 1   city             10 non-null     object 
 2   price            9 non-null      float64
 3   activation_date  10 non-null     object 
dtypes: float64(1), object(3)
memory usage: 452.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   user_id          10 non-null     string  
 1   city             10 non-null     category
 2   price            9 non-null      float64 
 3   activation_date  10 non-null     object  
dtypes: category(1), float64(1), object(1), string(1)
memory usage: 762.0+ bytes


In [51]:
# Argument n_rows
df_nrows = pd.read_csv('content/avito_sep.csv',
                       sep=';',
                       nrows=5)
print(df_nrows)

        user_id         city               category_name    price  \
0  dbe73ad6e4b5    Волгоград      Детская одежда и обувь      NaN   
1  2e11806abe57  Нижняя Тура                  Велосипеды   3000.0   
2  0b850bbebb10       Бердск               Аудио и видео  15000.0   
3   5f1d53ce0da      Саратов             Бытовая техника   4500.0   
4  23e2d97bfc7f      Бузулук  Товары для детей и игрушки   4900.0   

  activation_date user_type  year  month  day  
0      2017-04-18   Private  2017      4   18  
1      2017-04-16   Private  2017      4   16  
2      2017-04-17   Private  2017      4   17  
3      2017-04-17   Private  2017      4   17  
4      2017-04-15   Private  2017      4   15  


In [None]:
# Arguments parse_dates=...
df_parse_dates = pd.read_csv('content/avito_sep.csv',
                             sep=';',
                             parse_dates=['activation_date', 'price', 'category_name'])

df_parse_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   user_id          10 non-null     object        
 1   city             10 non-null     object        
 2   category_name    10 non-null     object        
 3   price            9 non-null      object        
 4   activation_date  10 non-null     datetime64[ns]
 5   user_type        10 non-null     object        
 6   year             10 non-null     int64         
 7   month            10 non-null     int64         
 8   day              10 non-null     int64         
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 852.0+ bytes


  df_parse_dates = pd.read_csv('content/avito_sep.csv',
  df_parse_dates = pd.read_csv('content/avito_sep.csv',


In [62]:
df_parse_dates = pd.read_csv('content/avito_sep.csv',
                             sep=';',
                            parse_dates=[['year', 'month', 'day'], 'activation_date', 'category_name']
)

print(df_parse_dates.info())
print(df_parse_dates)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   year_month_day   10 non-null     datetime64[ns]
 1   user_id          10 non-null     object        
 2   city             10 non-null     object        
 3   category_name    10 non-null     object        
 4   price            9 non-null      float64       
 5   activation_date  10 non-null     datetime64[ns]
 6   user_type        10 non-null     object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 692.0+ bytes
None
  year_month_day       user_id            city               category_name  \
0     2017-04-18  dbe73ad6e4b5       Волгоград      Детская одежда и обувь   
1     2017-04-16  2e11806abe57     Нижняя Тура                  Велосипеды   
2     2017-04-17  0b850bbebb10          Бердск               Аудио и видео   
3     2017-04-17   5f1d53ce0d

  df_parse_dates = pd.read_csv('content/avito_sep.csv',
  df_parse_dates = pd.read_csv('content/avito_sep.csv',


In [66]:
df_parse_dates1 = pd.read_csv('content/avito_sep.csv',
                             sep=';',
                            parse_dates={
                                'data0': ['activation_date'],
                                'data1': ['year', 'month', 'day']
                                }
)

print(df_parse_dates.info())
print(df_parse_dates1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   year_month_day   10 non-null     datetime64[ns]
 1   user_id          10 non-null     object        
 2   city             10 non-null     object        
 3   category_name    10 non-null     object        
 4   price            9 non-null      float64       
 5   activation_date  10 non-null     datetime64[ns]
 6   user_type        10 non-null     object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 692.0+ bytes
None
       data0      data1       user_id            city  \
0 2017-04-18 2017-04-18  dbe73ad6e4b5       Волгоград   
1 2017-04-16 2017-04-16  2e11806abe57     Нижняя Тура   
2 2017-04-17 2017-04-17  0b850bbebb10          Бердск   
3 2017-04-17 2017-04-17   5f1d53ce0da         Саратов   
4 2017-04-15 2017-04-15  23e2d97bfc7f         Бузулук   

  df_parse_dates1 = pd.read_csv('content/avito_sep.csv',


In [77]:
# Argument keep_date_col and encoding
df_parse_dates1 = pd.read_csv('content/avito_sep.csv',
                             sep=';',
                             keep_date_col=True,
                            parse_dates={
                                'data1': ['year', 'month', 'day'],
                                },
                            encoding='utf8'
)

print(df_parse_dates1)

       data1       user_id            city               category_name  \
0 2017-04-18  dbe73ad6e4b5       Волгоград      Детская одежда и обувь   
1 2017-04-16  2e11806abe57     Нижняя Тура                  Велосипеды   
2 2017-04-17  0b850bbebb10          Бердск               Аудио и видео   
3 2017-04-17   5f1d53ce0da         Саратов             Бытовая техника   
4 2017-04-15  23e2d97bfc7f         Бузулук  Товары для детей и игрушки   
5 2017-04-12  c2a632af2602  Ростов-на-Дону      Ремонт и строительство   
6 2017-04-17  b239811ad530        Оренбург                    Ноутбуки   
7 2017-04-18  d85fa02e6341     Калининград                    Телефоны   
8 2017-04-18  ae6586719bec     Новосибирск       Товары для компьютера   
9 2017-04-12  30ad26d633ef       Полевской      Детская одежда и обувь   

     price activation_date user_type  year month day  
0      NaN      2017-04-18   Private  2017     4  18  
1   3000.0      2017-04-16   Private  2017     4  16  
2  15000.0      2017

  df_parse_dates1 = pd.read_csv('content/avito_sep.csv',
  df_parse_dates1 = pd.read_csv('content/avito_sep.csv',


### Write to CSV

In [83]:
df = pd.read_csv('content/avito_data.csv')

# Save file
df.to_csv('content/avito_copy.csv', sep=';')

# Save columns
df.to_csv('content/avito_copy.csv',
          sep=';',
          columns=['city', 'price']
          )

# Argument header
df.to_csv('content/avito_copy.csv',
          sep=';',
          columns=['city', 'price'],
          header=['city_copy', 'price_copy']
          )

#Argument index
df.to_csv('content/avito_copy.csv',
          sep=';',
          index=True
          )