In [13]:
import pandas as pd
from datetime import datetime

# Path to the raw bug export, relative to the notebook's working directory.
file_path = 'bug_raw.csv'


def load_bug_reports(path):
    """Load the '@@,,@@'-delimited bug export and parse its first column as datetimes.

    Args:
        path: Location of the export file to read.

    Returns:
        A DataFrame whose first column has a datetime dtype. Values that
        cannot be parsed become NaT because ``errors='coerce'`` is used.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Separators longer than one character are treated as regular expressions
    # by pandas and require the python parsing engine.
    frame = pd.read_csv(path, sep='@@,,@@', engine='python', encoding='latin-1')

    # Assign by column label rather than ``frame.iloc[:, 0] = ...``: on recent
    # pandas versions the iloc form tries to write into the existing column in
    # place, which can leave it with its original object dtype instead of
    # replacing it with a datetime column.
    # NOTE(review): the recorded notebook output shows values with EDT/EST
    # suffixes; confirm to_datetime parses them rather than coercing to NaT.
    first_column = frame.columns[0]
    frame[first_column] = pd.to_datetime(frame[first_column], errors='coerce')
    return frame


try:
    # Read the export and convert its first column to datetimes.
    bug_raw = load_bug_reports(file_path)

    # Row and column counts.
    rows_count = bug_raw.shape[0]
    columns_count = bug_raw.shape[1]

    # Column titles (the name of every column).
    column_titles = bug_raw.columns.tolist()

    print(f"行数: {rows_count}")
    print(f"列数: {columns_count}")
    print("列标题:", column_titles)
except FileNotFoundError:
    print(f"文件未找到: {file_path}")
except Exception as e:
    print(f"读取文件时发生错误: {e}")


行数: 199960
列数: 5
列标题: ['when', 'bug_id', 'summary', 'description', 'who']


In [1]:
import pandas as pd

# Location of the raw bug export, relative to the working directory.
file_path = 'bug_raw.csv'

try:
    # Load the export; fields are delimited by the token '@@,,@@'.
    bug_raw = pd.read_csv(
        file_path,
        encoding='latin-1',
        engine='python',
        sep='@@,,@@',
    )

    # Tally how many rows each value in the 'who' column accounts for.
    who_counts = bug_raw.loc[:, 'who'].value_counts()

    # Names that occur strictly more than ten times.
    is_frequent = who_counts.gt(10)
    people_more_than_10 = who_counts.loc[is_frequent].index

    # Keep only the rows that belong to those names.
    row_mask = bug_raw.loc[:, 'who'].isin(people_more_than_10)
    filtered_rows = bug_raw.loc[row_mask]

    # Persist the selection as an ordinary comma-separated file.
    filtered_rows.to_csv(
        'filtered_bug_raw_more_than10.csv',
        encoding='latin-1',
        index=False,
    )

    # Show the selection.
    print("筛选后的结果：")
    print(filtered_rows)
except FileNotFoundError:
    print(f"文件未找到: {file_path}")
except Exception as err:
    print(f"读取文件时发生错误: {err}")


筛选后的结果：
                           when  bug_id  \
0       2001-10-19 16:36:00 EDT       1   
1       2002-04-30 16:30:46 EDT       2   
2       2002-05-08 16:18:21 EDT       3   
3       2002-03-01 16:27:31 EST       4   
4       2001-10-12 11:55:35 EDT      10   
...                         ...     ...   
199955  2011-09-20 09:14:51 EDT  357563   
199956  2011-10-20 15:56:02 EDT  357565   
199957  2011-09-14 14:16:09 EDT  357567   
199958  2011-09-16 13:53:35 EDT  357569   
199959  2011-10-26 17:44:22 EDT  357573   

                                                  summary  \
0         Usability issue with external editors (1GE6IRL)   
1       Opening repository resources doesn't honor typ...   
2               Sync does not indicate deletion (1GIEN83)   
3       need better error message if catching up over ...   
4                  API - VCM event notification (1G8G6RR)   
...                                                   ...   
199955   Out of bounds exception from System Mon

In [28]:
import pandas as pd

# Path to the raw bug export, relative to the notebook's working directory.
file_path = 'bug_raw.csv'

# Exclusive bounds on how many rows a name must have to be kept: strictly more
# than 10 and strictly fewer than 13, i.e. exactly 11 or 12 rows.
MIN_COUNT_EXCLUSIVE = 10
MAX_COUNT_EXCLUSIVE = 13


def names_with_count_between(counts, lower, upper):
    """Return the index labels whose count lies strictly between two bounds.

    Args:
        counts: A Series mapping each name to its number of occurrences,
            e.g. the result of ``Series.value_counts()``.
        lower: Exclusive lower bound on the count.
        upper: Exclusive upper bound on the count.

    Returns:
        The index of ``counts`` restricted to entries with
        ``lower < count < upper``, in the original order of ``counts``.
    """
    return counts[(counts > lower) & (counts < upper)].index


try:
    # Read the export; fields are delimited by the token '@@,,@@'.
    bug_raw = pd.read_csv(file_path, sep='@@,,@@', engine='python', encoding='latin-1')

    # Number of rows per value of the 'who' column.
    who_counts = bug_raw['who'].value_counts()

    # Names that occur more than 10 and fewer than 13 times. The earlier
    # comment and variable name said 20, but the filter, the output filename
    # and the printed message all use 13.
    people_between_10_and_13 = names_with_count_between(
        who_counts, MIN_COUNT_EXCLUSIVE, MAX_COUNT_EXCLUSIVE
    )
    # Legacy alias kept so code that still uses the old, misleading name works.
    people_between_10_and_20 = people_between_10_and_13

    # Rows that belong to the selected names.
    filtered_rows = bug_raw[bug_raw['who'].isin(people_between_10_and_13)]

    # Write the selection to a new comma-separated CSV file.
    filtered_rows.to_csv('filtered_bug_raw_10_to_13.csv', index=False, encoding='latin-1')

    # Show the selection.
    print("筛选后的结果（出现次数大于10且小于13的人名）：")
    print(filtered_rows)
except FileNotFoundError:
    print(f"文件未找到: {file_path}")
except Exception as e:
    print(f"读取文件时发生错误: {e}")


筛选后的结果（出现次数大于10且小于20的人名）：
                           when  bug_id  \
335     2002-01-28 16:36:44 EST    1936   
668     2002-02-04 11:13:22 EST    2890   
679     2002-02-04 16:43:59 EST    2913   
1806    2002-01-31 12:03:40 EST    5147   
2089    2002-02-12 10:36:40 EST    5667   
...                         ...     ...   
199463  2011-09-10 11:24:40 EDT  356737   
199612  2011-09-23 09:49:25 EDT  356992   
199689  2011-09-23 09:58:48 EDT  357119   
199738  2011-10-05 09:42:08 EDT  357195   
199817  2013-03-19 07:23:02 EDT  357326   

                                                  summary  \
335     Problem when overriding Dialog.createDialogAre...   
668     EC - Area left by noDefaultAndApplyButton (1GL...   
679      Readme example prefs missing mnemonics (1GKM5TU)   
1806    [JFace] DialogSettings convenience methods do ...   
2089    TVT: Reset Defaults button on resources Proper...   
...                                                   ...   
199463  UnitFormatTest fails d

In [1]:
import pandas as pd

# Path to the Eclipse dataset, relative to the notebook's working directory.
new_file_path = './dataset/Eclipse_total.csv'

try:
    # Load the comma-separated Eclipse dataset.
    new_bug_raw = pd.read_csv(new_file_path, encoding='latin-1')

    # Table size and number of distinct values in the 'developer' column.
    nrows, ncols = new_bug_raw.shape
    unique_who_count = new_bug_raw['developer'].nunique()
    print(unique_who_count)
    print(f"行数: {nrows}, 列数: {ncols}")

    # Number of rows per developer, then the developers with more than 10 rows.
    who_counts = new_bug_raw['developer'].value_counts()
    people_morethan10 = who_counts[(who_counts > 10)].index

    # Rows that belong to those developers.
    filtered_rows = new_bug_raw[new_bug_raw['developer'].isin(people_morethan10)]

    # Write the selection to a new comma-separated CSV file.
    filtered_rows.to_csv('./dataset/Eclipse_total_morethan10.csv', index=False, encoding='latin-1')

    # Show the selection.
    print("筛选后的结果出现次数大于10：")
    print(filtered_rows)
except FileNotFoundError:
    # Report the path this cell actually tried to open. The handler used to
    # reference `file_path`, which this cell never defines, so a missing file
    # raised NameError instead of printing this message.
    print(f"文件未找到: {new_file_path}")
except Exception as e:
    print(f"读取文件时发生错误: {e}")

771
行数: 41830, 列数: 10
筛选后的结果出现次数大于10：
       bug_id    product                                          abstracts  \
0      221036  Community                 unabl enter request parti code cq    
1      221203  Community                       submit request cq cq submit    
2      243952  Community     ca submit cq rd parti contribut develop apach    
3      230229  Community                                 tool link fail ie    
4      229095  Community                                 cron total broken    
...       ...        ...                                                ...   
41825   21947        PDE                       extens templat nature build    
41826  222945        PDE                    ca forc version instal plug in    
41827    5729        PDE       pde project wizard default src folder soruc    
41828    5732        PDE      disabl ad sourc folder runtim librari select    
41829    5727        PDE  distinguish develop runtim workbench prferenc ...   

             

统计行数耗时约 30 秒，也就是说读取这个文件大约需要 30 秒，开销不大；行数一共是 199960 行。


